RISC-V: Make dynamic LMUL cost model more accurate for conversion codes
[official-gcc.git] / gcc / tree-vect-loop.cc
blob f51ae3e719e753059389cf9495b6d65b3b1191cb
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different vector sizes will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
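/* As an illustrative sketch only (not code used by the pass), the
   transformation described above can be written as standalone C using
   GCC's vector_size extension; N is assumed to be a multiple of 8 so
   that no scalar epilogue is needed:

     #define N 1024
     typedef short v8hi __attribute__ ((vector_size (16)));   // 8 x short

     short a[N], b[N], c[N];

     void
     scalar_add (void)
     {
       for (int i = 0; i < N; i++)
         a[i] = b[i] + c[i];
     }

     void
     hand_vectorized_add (void)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];
     }

   Compiling scalar_add with -O3 (or -O2 -ftree-vectorize) on a target with
   16-byte vectors should produce code equivalent to the second function.  */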
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
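/* A minimal sketch of the strip-mined form for the 4-byte/16-byte example
   above (VF == 4), assuming N is a multiple of VF so that no epilogue loop
   is required; the a[i:VF] notation above corresponds to the vector
   accesses here:

     #define N 1024
     typedef int v4si __attribute__ ((vector_size (16)));   // 4 x int

     int a[N], b[N], c[N];

     void
     strip_mined_add (void)
     {
       for (int i = 0; i < N; i += 4)   // i advances by VF each iteration
         *(v4si *) &a[i] = *(v4si *) &b[i] + *(v4si *) &c[i];
     }
*/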
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
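/* For instance (illustrative only): in the loop below the scalar J has the
   affine evolution {j_0, +, 3}_1, so *INIT would be j_0 and *STEP would be 3,
   whereas an update like j = j * 2 has no such evolution and is rejected here
   (it may instead be recognized by vect_is_nonlinear_iv_evolution below):

     void
     f (int *a, int n, int j)
     {
       for (int i = 0; i < n; i++)
         {
           a[i] = j;
           j += 3;
         }
     }
*/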
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
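/* An illustrative example of the neg form (the mul and shift forms look the
   same with x = x * 3, x = x << 1 or x = x >> 1 as the update); for the neg
   update the recorded step is the fake constant -1 mentioned above:

     void
     f (int *a, int n, int x)
     {
       for (int i = 0; i < n; i++)
         {
           a[i] = x;
           x = -x;
         }
     }
*/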
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
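/* In source terms a double reduction typically arises from an accumulation
   carried across both loops of a nest, e.g. (illustrative):

     int
     sum2d (int a[][16], int n)
     {
       int sum = 0;
       for (int i = 0; i < n; i++)
         for (int j = 0; j < 16; j++)
           sum += a[i][j];
       return sum;
     }

   Here x_1 above corresponds to the outer-loop PHI of sum and x_2 to the
   inner-loop PHI, for which this predicate returns true during outer-loop
   analysis.  */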
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
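/* An illustrative source form of such a recurrence; PREV carries the value
   of a[i] from the previous iteration, and vectorizing it requires a
   shuffle/permute as noted in the function body below:

     void
     f (int *a, int *b, int n, int prev)
     {
       for (int i = 0; i < n; i++)
         {
           b[i] = a[i] + prev;
           prev = a[i];
         }
     }
*/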
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be subsequent
585 SLP analyses.
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
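/* Example3 (illustrative): a cross-iteration cycle that combines several
   operations, such as

     for (i=0; i<N; i++)
       s = s * a[i] + b[i];

   is generally not recognized as an induction or a supported reduction and
   would be classified as an unknown def-use cycle, preventing vectorization
   of the statements involved.  */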
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
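/* As a concrete (illustrative) example: for a counted loop whose main exit
   tests i < n and which is known to execute at least once, the latch runs
   n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS
   (the number of header executions) is n; ASSUMPTIONS is boolean_true_node
   unless the niter expression is only valid under extra conditions.  */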
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must first determine which exit is the main one and
962 which are auxiliary exits. */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment. Analyze
968 all exits and return one. */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 early_breaks (false),
1044 no_data_dependencies (false),
1045 has_mask_store (false),
1046 scalar_loop_scaling (profile_probability::uninitialized ()),
1047 scalar_loop (NULL),
1048 orig_loop_info (NULL),
1049 vec_loop_iv_exit (NULL),
1050 vec_epilogue_loop_iv_exit (NULL),
1051 scalar_loop_iv_exit (NULL)
1053 /* CHECKME: We want to visit all BBs before their successors (except for
1054 latch blocks, for which this assertion wouldn't hold). In the simple
1055 case of the loop forms we allow, a dfs order of the BBs would be the same
1056 as a reversed postorder traversal, so we are safe. */
1058 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1059 bbs, loop->num_nodes, loop);
1060 gcc_assert (nbbs == loop->num_nodes);
1062 for (unsigned int i = 0; i < nbbs; i++)
1064 basic_block bb = bbs[i];
1065 gimple_stmt_iterator si;
1067 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1069 gimple *phi = gsi_stmt (si);
1070 gimple_set_uid (phi, 0);
1071 add_stmt (phi);
1074 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1076 gimple *stmt = gsi_stmt (si);
1077 gimple_set_uid (stmt, 0);
1078 if (is_gimple_debug (stmt))
1079 continue;
1080 add_stmt (stmt);
1081 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1082 third argument is the #pragma omp simd if (x) condition: when it is 0,
1083 the loop shouldn't be vectorized; when it is a non-zero constant, it should
1084 be vectorized normally; otherwise the loop is versioned, with the vectorized
1085 copy used if the condition is non-zero at runtime. */
1086 if (loop_in->simduid
1087 && is_gimple_call (stmt)
1088 && gimple_call_internal_p (stmt)
1089 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1090 && gimple_call_num_args (stmt) >= 3
1091 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1092 && (loop_in->simduid
1093 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1095 tree arg = gimple_call_arg (stmt, 2);
1096 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1097 simd_if_cond = arg;
1098 else
1099 gcc_assert (integer_nonzerop (arg));
1104 epilogue_vinfos.create (6);
1107 /* Free all levels of rgroup CONTROLS. */
1109 void
1110 release_vec_loop_controls (vec<rgroup_controls> *controls)
1112 rgroup_controls *rgc;
1113 unsigned int i;
1114 FOR_EACH_VEC_ELT (*controls, i, rgc)
1115 rgc->controls.release ();
1116 controls->release ();
1119 /* Free all memory used by the _loop_vec_info, as well as all the
1120 stmt_vec_info structs of all the stmts in the loop. */
1122 _loop_vec_info::~_loop_vec_info ()
1124 free (bbs);
1126 release_vec_loop_controls (&masks.rgc_vec);
1127 release_vec_loop_controls (&lens);
1128 delete ivexpr_map;
1129 delete scan_map;
1130 epilogue_vinfos.release ();
1131 delete scalar_costs;
1132 delete vector_costs;
1134 /* When we release an epilogue vinfo that we do not intend to use
1135 avoid clearing AUX of the main loop which should continue to
1136 point to the main loop vinfo since otherwise we'll leak that. */
1137 if (loop->aux == this)
1138 loop->aux = NULL;
1141 /* Return an invariant or register for EXPR and emit necessary
1142 computations in the LOOP_VINFO loop preheader. */
1144 tree
1145 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1147 if (is_gimple_reg (expr)
1148 || is_gimple_min_invariant (expr))
1149 return expr;
1151 if (! loop_vinfo->ivexpr_map)
1152 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1153 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1154 if (! cached)
1156 gimple_seq stmts = NULL;
1157 cached = force_gimple_operand (unshare_expr (expr),
1158 &stmts, true, NULL_TREE);
1159 if (stmts)
1161 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1162 gsi_insert_seq_on_edge_immediate (e, stmts);
1165 return cached;
1168 /* Return true if we can use CMP_TYPE as the comparison type to produce
1169 all masks required to mask LOOP_VINFO. */
1171 static bool
1172 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1174 rgroup_controls *rgm;
1175 unsigned int i;
1176 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1177 if (rgm->type != NULL_TREE
1178 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1179 cmp_type, rgm->type,
1180 OPTIMIZE_FOR_SPEED))
1181 return false;
1182 return true;
1185 /* Calculate the maximum number of scalars per iteration for every
1186 rgroup in LOOP_VINFO. */
1188 static unsigned int
1189 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1191 unsigned int res = 1;
1192 unsigned int i;
1193 rgroup_controls *rgm;
1194 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1195 res = MAX (res, rgm->max_nscalars_per_iter);
1196 return res;
1199 /* Calculate the minimum precision necessary to represent:
1201 MAX_NITERS * FACTOR
1203 as an unsigned integer, where MAX_NITERS is the maximum number of
1204 loop header iterations for the original scalar form of LOOP_VINFO. */
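/* For example, with MAX_NITERS == 1000 and FACTOR == 4 the product is 4000,
   which needs 12 bits as an unsigned value, so 12 is returned.  A rough
   standalone sketch of the same computation (hypothetical helper using a
   64-bit product instead of the widest_int arithmetic used below):

     static unsigned
     min_prec_unsigned (unsigned long long max_niters, unsigned factor)
     {
       unsigned long long limit = max_niters * factor;
       // bits needed to represent LIMIT as an unsigned integer
       return limit ? 64 - __builtin_clzll (limit) : 1;
     }
*/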
1206 static unsigned
1207 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1209 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1211 /* Get the maximum number of iterations that is representable
1212 in the counter type. */
1213 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1214 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1216 /* Get a more refined estimate for the number of iterations. */
1217 widest_int max_back_edges;
1218 if (max_loop_iterations (loop, &max_back_edges))
1219 max_ni = wi::smin (max_ni, max_back_edges + 1);
1221 /* Work out how many bits we need to represent the limit. */
1222 return wi::min_precision (max_ni * factor, UNSIGNED);
1225 /* True if the loop needs peeling or partial vectors when vectorized. */
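/* For example, with a constant VF of 4, a known niter count of 10 and no
   peeling for alignment or gaps, 10 is not a multiple of 4, so peeling or
   partial vectors are needed; with a niter count of 12 they are not.  */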
1227 static bool
1228 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1230 unsigned HOST_WIDE_INT const_vf;
1231 HOST_WIDE_INT max_niter
1232 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1234 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1235 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1236 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1237 (loop_vinfo));
1239 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1240 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1242 /* Work out the (constant) number of iterations that need to be
1243 peeled for reasons other than niters. */
1244 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1245 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1246 peel_niter += 1;
1247 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1248 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1249 return true;
1251 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1252 /* ??? When peeling for gaps but not alignment, we could
1253 try to check whether the (variable) niters is known to be
1254 VF * N + 1. That's something of a niche case though. */
1255 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1256 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1257 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1258 < (unsigned) exact_log2 (const_vf))
1259 /* In case of versioning, check if the maximum number of
1260 iterations is greater than th. If they are identical,
1261 the epilogue is unnecessary. */
1262 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1263 || ((unsigned HOST_WIDE_INT) max_niter
1264 > (th / const_vf) * const_vf))))
1265 return true;
1267 return false;
1270 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1271 whether we can actually generate the masks required. Return true if so,
1272 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1274 static bool
1275 vect_verify_full_masking (loop_vec_info loop_vinfo)
1277 unsigned int min_ni_width;
1279 /* Use a normal loop if there are no statements that need masking.
1280 This only happens in rare degenerate cases: it means that the loop
1281 has no loads, no stores, and no live-out values. */
1282 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1283 return false;
1285 /* Produce the rgroup controls. */
1286 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1288 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1289 tree vectype = mask.first;
1290 unsigned nvectors = mask.second;
1292 if (masks->rgc_vec.length () < nvectors)
1293 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1294 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1295 /* The number of scalars per iteration and the number of vectors are
1296 both compile-time constants. */
1297 unsigned int nscalars_per_iter
1298 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1299 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1301 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1303 rgm->max_nscalars_per_iter = nscalars_per_iter;
1304 rgm->type = truth_type_for (vectype);
1305 rgm->factor = 1;
1309 unsigned int max_nscalars_per_iter
1310 = vect_get_max_nscalars_per_iter (loop_vinfo);
1312 /* Work out how many bits we need to represent the limit. */
1313 min_ni_width
1314 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1316 /* Find a scalar mode for which WHILE_ULT is supported. */
1317 opt_scalar_int_mode cmp_mode_iter;
1318 tree cmp_type = NULL_TREE;
1319 tree iv_type = NULL_TREE;
1320 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1321 unsigned int iv_precision = UINT_MAX;
1323 if (iv_limit != -1)
1324 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1325 UNSIGNED);
1327 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1329 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1330 if (cmp_bits >= min_ni_width
1331 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1333 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1334 if (this_type
1335 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1337 /* Although we could stop as soon as we find a valid mode,
1338 there are at least two reasons why that's not always the
1339 best choice:
1341 - An IV that's Pmode or wider is more likely to be reusable
1342 in address calculations than an IV that's narrower than
1343 Pmode.
1345 - Doing the comparison in IV_PRECISION or wider allows
1346 a natural 0-based IV, whereas using a narrower comparison
1347 type requires mitigations against wrap-around.
1349 Conversely, if the IV limit is variable, doing the comparison
1350 in a wider type than the original type can introduce
1351 unnecessary extensions, so picking the widest valid mode
1352 is not always a good choice either.
1354 Here we prefer the first IV type that's Pmode or wider,
1355 and the first comparison type that's IV_PRECISION or wider.
1356 (The comparison type must be no wider than the IV type,
1357 to avoid extensions in the vector loop.)
1359 ??? We might want to try continuing beyond Pmode for ILP32
1360 targets if CMP_BITS < IV_PRECISION. */
1361 iv_type = this_type;
1362 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1363 cmp_type = this_type;
1364 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1365 break;
1370 if (!cmp_type)
1372 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1373 return false;
1376 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1377 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1378 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1379 return true;
1382 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1383 whether we can actually generate AVX512 style masks. Return true if so,
1384 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1386 static bool
1387 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1389 /* Produce a differently organized rgc_vec and check differently whether
1390 we can produce masks. */
1392 /* Use a normal loop if there are no statements that need masking.
1393 This only happens in rare degenerate cases: it means that the loop
1394 has no loads, no stores, and no live-out values. */
1395 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1396 return false;
1398 /* For the decrementing IV we need to represent all values in
1399 [0, niter + niter_skip] where niter_skip is the number of elements we
1400 skip in the first iteration for prologue peeling. */
1401 tree iv_type = NULL_TREE;
1402 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1403 unsigned int iv_precision = UINT_MAX;
1404 if (iv_limit != -1)
1405 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1407 /* First compute the type for the IV we use to track the remaining
1408 scalar iterations. */
1409 opt_scalar_int_mode cmp_mode_iter;
1410 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1412 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1413 if (cmp_bits >= iv_precision
1414 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1416 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1417 if (iv_type)
1418 break;
1421 if (!iv_type)
1422 return false;
1424 /* Produce the rgroup controls. */
1425 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1427 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1428 tree vectype = mask.first;
1429 unsigned nvectors = mask.second;
1431 /* The number of scalars per iteration and the number of vectors are
1432 both compile-time constants. */
1433 unsigned int nscalars_per_iter
1434 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1435 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1437 /* We index the rgroup_controls vector with nscalars_per_iter
1438 which we keep constant and instead have a varying nvectors,
1439 remembering the vector mask with the fewest nV. */
1440 if (masks->rgc_vec.length () < nscalars_per_iter)
1441 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1442 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1444 if (!rgm->type || rgm->factor > nvectors)
1446 rgm->type = truth_type_for (vectype);
1447 rgm->compare_type = NULL_TREE;
1448 rgm->max_nscalars_per_iter = nscalars_per_iter;
1449 rgm->factor = nvectors;
1450 rgm->bias_adjusted_ctrl = NULL_TREE;
1454 /* There is no fixed compare type we are going to use but we have to
1455 be able to get at one for each mask group. */
1456 unsigned int min_ni_width
1457 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1459 bool ok = true;
1460 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1462 tree mask_type = rgc.type;
1463 if (!mask_type)
1464 continue;
1466 /* For now vect_get_loop_mask only supports integer mode masks
1467 when we need to split it. */
1468 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1469 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1471 ok = false;
1472 break;
1475 /* If iv_type is usable as compare type use that - we can elide the
1476 saturation in that case. */
1477 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1479 tree cmp_vectype
1480 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1481 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1482 rgc.compare_type = cmp_vectype;
1484 if (!rgc.compare_type)
1485 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1487 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1488 if (cmp_bits >= min_ni_width
1489 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1491 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1492 if (!cmp_type)
1493 continue;
1495 /* Check whether we can produce the mask with cmp_type. */
1496 tree cmp_vectype
1497 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1498 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1500 rgc.compare_type = cmp_vectype;
1501 break;
1505 if (!rgc.compare_type)
1507 ok = false;
1508 break;
1511 if (!ok)
1513 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1514 return false;
1517 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1518 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1519 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1520 return true;
1523 /* Check whether we can use vector accesses with length, based on a precision
1524 comparison. So far, to keep it simple, we only allow the case where the
1525 precision of the target-supported length is larger than the precision
1526 required by the loop niters. */
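/* For instance, with VF == 4 and 10 scalar iterations the per-iteration
   lengths would be 4, 4 and 2 (ignoring any load/store bias the target may
   require).  A rough scalar model of that control flow, with the inner loop
   standing in for a single length-controlled vector access:

     void
     f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i += 4)
         {
           int len = n - i < 4 ? n - i : 4;
           for (int j = 0; j < len; j++)
             a[i + j] = b[i + j];
         }
     }
*/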
1528 static bool
1529 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1531 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1532 return false;
1534 machine_mode len_load_mode, len_store_mode;
1535 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1536 .exists (&len_load_mode))
1537 return false;
1538 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1539 .exists (&len_store_mode))
1540 return false;
1542 signed char partial_load_bias = internal_len_load_store_bias
1543 (IFN_LEN_LOAD, len_load_mode);
1545 signed char partial_store_bias = internal_len_load_store_bias
1546 (IFN_LEN_STORE, len_store_mode);
1548 gcc_assert (partial_load_bias == partial_store_bias);
1550 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1551 return false;
1553 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1554 len_loads with a length of zero. In order to avoid that we prohibit
1555 more than one loop length here. */
1556 if (partial_load_bias == -1
1557 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1558 return false;
1560 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1562 unsigned int max_nitems_per_iter = 1;
1563 unsigned int i;
1564 rgroup_controls *rgl;
1565 /* Find the maximum number of items per iteration for every rgroup. */
1566 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1568 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1569 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1572 /* Work out how many bits we need to represent the length limit. */
1573 unsigned int min_ni_prec
1574 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1576 /* Now use the maximum of the precisions below for one suitable IV type:
1577 - the IV's natural precision
1578 - the precision needed to hold: the maximum number of scalar
1579 iterations multiplied by the scale factor (min_ni_prec above)
1580 - the Pmode precision
1582 If min_ni_prec is less than the precision of the current niters,
1583 we prefer to still use the niters type. Prefer Pmode or a
1584 wider IV to avoid narrow conversions. */
1586 unsigned int ni_prec
1587 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1588 min_ni_prec = MAX (min_ni_prec, ni_prec);
1589 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1591 tree iv_type = NULL_TREE;
1592 opt_scalar_int_mode tmode_iter;
1593 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1595 scalar_mode tmode = tmode_iter.require ();
1596 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1598 /* ??? Do we really want to construct one IV whose precision exceeds
1599 BITS_PER_WORD? */
1600 if (tbits > BITS_PER_WORD)
1601 break;
1603 /* Find the first available standard integral type. */
1604 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1606 iv_type = build_nonstandard_integer_type (tbits, true);
1607 break;
1611 if (!iv_type)
1613 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1615 "can't vectorize with length-based partial vectors"
1616 " because there is no suitable iv type.\n");
1617 return false;
1620 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1621 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1622 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1624 return true;
1627 /* Calculate the cost of one scalar iteration of the loop. */
1628 static void
1629 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1631 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1632 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1633 int nbbs = loop->num_nodes, factor;
1634 int innerloop_iters, i;
1636 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1638 /* Gather costs for statements in the scalar loop. */
1640 /* FORNOW. */
1641 innerloop_iters = 1;
1642 if (loop->inner)
1643 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1645 for (i = 0; i < nbbs; i++)
1647 gimple_stmt_iterator si;
1648 basic_block bb = bbs[i];
1650 if (bb->loop_father == loop->inner)
1651 factor = innerloop_iters;
1652 else
1653 factor = 1;
1655 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1657 gimple *stmt = gsi_stmt (si);
1658 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1660 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1661 continue;
1663 /* Skip stmts that are not vectorized inside the loop. */
1664 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1665 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1666 && (!STMT_VINFO_LIVE_P (vstmt_info)
1667 || !VECTORIZABLE_CYCLE_DEF
1668 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1669 continue;
1671 vect_cost_for_stmt kind;
1672 if (STMT_VINFO_DATA_REF (stmt_info))
1674 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1675 kind = scalar_load;
1676 else
1677 kind = scalar_store;
1679 else if (vect_nop_conversion_p (stmt_info))
1680 continue;
1681 else
1682 kind = scalar_stmt;
1684 /* We are using vect_prologue here to avoid scaling twice
1685 by the inner loop factor. */
1686 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1687 factor, kind, stmt_info, 0, vect_prologue);
1691 /* Now accumulate cost. */
1692 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1693 add_stmt_costs (loop_vinfo->scalar_costs,
1694 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1695 loop_vinfo->scalar_costs->finish_cost (nullptr);
1698 /* Function vect_analyze_loop_form.
1700 Verify that certain CFG restrictions hold, including:
1701 - the loop has a pre-header
1702 - the loop has a single entry
1703 - nested loops can have only a single exit.
1704 - the loop exit condition is simple enough
1705 - the number of iterations can be analyzed, i.e., a countable loop. The
1706 niter may be analyzable only under some assumptions. */
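/* A shape that satisfies these restrictions is the classic counted
   do-while-style loop produced by loop header copying: all executable
   statements in the header block, an empty latch, and a single countable
   exit condition, e.g. (illustrative):

     void
     f (int *a, int n)
     {
       int i = 0;
       if (n > 0)
         do
           {
             a[i] = i;
             i++;
           }
         while (i < n);
     }
*/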
1708 opt_result
1709 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1711 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1713 edge exit_e = vec_init_loop_exit_info (loop);
1714 if (!exit_e)
1715 return opt_result::failure_at (vect_location,
1716 "not vectorized:"
1717 " could not determine main exit from"
1718 " loop with multiple exits.\n");
1719 info->loop_exit = exit_e;
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_NOTE, vect_location,
1722 "using as main loop exit: %d -> %d [AUX: %p]\n",
1723 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1725 /* Check if we have any control flow that doesn't leave the loop. */
1726 class loop *v_loop = loop->inner ? loop->inner : loop;
1727 basic_block *bbs= get_loop_body (v_loop);
1728 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1729 if (EDGE_COUNT (bbs[i]->succs) != 1
1730 && (EDGE_COUNT (bbs[i]->succs) != 2
1731 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1732 return opt_result::failure_at (vect_location,
1733 "not vectorized:"
1734 " unsupported control flow in loop.\n");
1736 /* Different restrictions apply when we are considering an inner-most loop,
1737 vs. an outer (nested) loop.
1738 (FORNOW. May want to relax some of these restrictions in the future). */
1740 info->inner_loop_cond = NULL;
1741 if (!loop->inner)
1743 /* Inner-most loop. We currently require that the number of BBs is
1744 exactly 2 (the header and latch). Vectorizable inner-most loops
1745 look like this:
1747 (pre-header)
1749 header <--------+
1750 | | |
1751 | +--> latch --+
1753 (exit-bb) */
1755 if (empty_block_p (loop->header))
1756 return opt_result::failure_at (vect_location,
1757 "not vectorized: empty loop.\n");
1759 else
1761 class loop *innerloop = loop->inner;
1762 edge entryedge;
1764 /* Nested loop. We currently require that the loop is doubly-nested,
1765 contains a single inner loop, and the number of BBs is exactly 5.
1766 Vectorizable outer-loops look like this:
1768 (pre-header)
1770 header <---+
1772 inner-loop |
1774 tail ------+
1776 (exit-bb)
1778 The inner-loop has the properties expected of inner-most loops
1779 as described above. */
1781 if ((loop->inner)->inner || (loop->inner)->next)
1782 return opt_result::failure_at (vect_location,
1783 "not vectorized:"
1784 " multiple nested loops.\n");
1786 entryedge = loop_preheader_edge (innerloop);
1787 if (entryedge->src != loop->header
1788 || !single_exit (innerloop)
1789 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1790 return opt_result::failure_at (vect_location,
1791 "not vectorized:"
1792 " unsupported outerloop form.\n");
1794 /* Analyze the inner-loop. */
1795 vect_loop_form_info inner;
1796 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1797 if (!res)
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1801 "not vectorized: Bad inner loop.\n");
1802 return res;
1805 /* Don't support analyzing niter under assumptions for inner
1806 loop. */
1807 if (!integer_onep (inner.assumptions))
1808 return opt_result::failure_at (vect_location,
1809 "not vectorized: Bad inner loop.\n");
1811 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1812 return opt_result::failure_at (vect_location,
1813 "not vectorized: inner-loop count not"
1814 " invariant.\n");
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_NOTE, vect_location,
1818 "Considering outer-loop vectorization.\n");
1819 info->inner_loop_cond = inner.conds[0];
1822 if (EDGE_COUNT (loop->header->preds) != 2)
1823 return opt_result::failure_at (vect_location,
1824 "not vectorized:"
1825 " too many incoming edges.\n");
1827 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1828 that the loop is represented as a do-while (with a proper if-guard
1829 before the loop if needed), where the loop header contains all the
1830 executable statements, and the latch is empty. */
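/* For illustration only (a, i and n are hypothetical names): a source loop

       while (i < n)
         {
           a[i] = a[i] * 2;
           i++;
         }

   is expected to reach the vectorizer in the guarded do-while shape

       if (i < n)
         do
           {
             a[i] = a[i] * 2;
             i++;
           }
         while (i < n);

   so that the exit test is the last statement of the header and the latch
   block stays empty.  */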
1831 if (!empty_block_p (loop->latch)
1832 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1833 return opt_result::failure_at (vect_location,
1834 "not vectorized: latch block not empty.\n");
1836 /* Make sure the exit is not abnormal. */
1837 auto_vec<edge> exits = get_loop_exit_edges (loop);
1838 for (edge e : exits)
1840 if (e->flags & EDGE_ABNORMAL)
1841 return opt_result::failure_at (vect_location,
1842 "not vectorized:"
1843 " abnormal loop exit edge.\n");
1846 info->conds
1847 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1848 &info->number_of_iterations,
1849 &info->number_of_iterationsm1);
1851 if (info->conds.is_empty ())
1852 return opt_result::failure_at
1853 (vect_location,
1854 "not vectorized: complicated exit condition.\n");
1856 /* Determine what the primary and alternate exit conds are. */
1857 for (unsigned i = 0; i < info->conds.length (); i++)
1859 gcond *cond = info->conds[i];
1860 if (exit_e->src == gimple_bb (cond))
1861 std::swap (info->conds[0], info->conds[i]);
1864 if (integer_zerop (info->assumptions)
1865 || !info->number_of_iterations
1866 || chrec_contains_undetermined (info->number_of_iterations))
1867 return opt_result::failure_at
1868 (info->conds[0],
1869 "not vectorized: number of iterations cannot be computed.\n");
1871 if (integer_zerop (info->number_of_iterations))
1872 return opt_result::failure_at
1873 (info->conds[0],
1874 "not vectorized: number of iterations = 0.\n");
1876 if (!(tree_fits_shwi_p (info->number_of_iterations)
1877 && tree_to_shwi (info->number_of_iterations) > 0))
1879 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_NOTE, vect_location,
1882 "Symbolic number of iterations is ");
1883 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1884 dump_printf (MSG_NOTE, "\n");
1888 return opt_result::success ();
1891 /* Create a loop_vec_info for LOOP with SHARED and the
1892 vect_analyze_loop_form result. */
1894 loop_vec_info
1895 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1896 const vect_loop_form_info *info,
1897 loop_vec_info main_loop_info)
1899 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1900 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1901 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1902 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1903 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1904 /* Also record the assumptions for versioning. */
1905 if (!integer_onep (info->assumptions) && !main_loop_info)
1906 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1908 for (gcond *cond : info->conds)
1910 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1911 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1912 /* Mark the statement as a condition. */
1913 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1916 for (unsigned i = 1; i < info->conds.length (); i ++)
1917 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1918 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1920 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1922 /* Check to see if we're vectorizing multiple exits. */
1923 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1924 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1926 if (info->inner_loop_cond)
1928 stmt_vec_info inner_loop_cond_info
1929 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1930 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1931 /* If we have an estimate on the number of iterations of the inner
1932 loop, use that to limit the scale for costing; otherwise use
1933 --param vect-inner-loop-cost-factor literally. */
1934 widest_int nit;
1935 if (estimated_stmt_executions (loop->inner, &nit))
1936 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1937 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1940 return loop_vinfo;
1945 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1946 statements, update the vectorization factor. */
1948 static void
1949 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1951 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1952 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1953 int nbbs = loop->num_nodes;
1954 poly_uint64 vectorization_factor;
1955 int i;
1957 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1959 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1960 gcc_assert (known_ne (vectorization_factor, 0U));
1962 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1963 vectorization factor of the loop is the unrolling factor required by
1964 the SLP instances. If that unrolling factor is 1, we say that we
1965 perform pure SLP on the loop - cross-iteration parallelism is not
1966 exploited. */
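/* A hedged example (not from the original sources), assuming 4-lane int
   vectors and hypothetical names a and n: in

       for (int i = 0; i < n; i++)
         {
           a[4 * i + 0] = 1;
           a[4 * i + 1] = 2;
           a[4 * i + 2] = 3;
           a[4 * i + 3] = 4;
         }

   the four stores form one SLP group that already fills a vector, so the
   unrolling factor is 1 and the loop is pure SLP; with only two such stores
   per iteration the unrolling factor would be 2, combining two scalar
   iterations per vector iteration.  */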
1967 bool only_slp_in_loop = true;
1968 for (i = 0; i < nbbs; i++)
1970 basic_block bb = bbs[i];
1971 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1972 gsi_next (&si))
1974 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1975 if (!stmt_info)
1976 continue;
1977 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1978 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1979 && !PURE_SLP_STMT (stmt_info))
1980 /* STMT needs both SLP and loop-based vectorization. */
1981 only_slp_in_loop = false;
1983 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1984 gsi_next (&si))
1986 if (is_gimple_debug (gsi_stmt (si)))
1987 continue;
1988 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1989 stmt_info = vect_stmt_to_vectorize (stmt_info);
1990 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1991 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1992 && !PURE_SLP_STMT (stmt_info))
1993 /* STMT needs both SLP and loop-based vectorization. */
1994 only_slp_in_loop = false;
1998 if (only_slp_in_loop)
2000 if (dump_enabled_p ())
2001 dump_printf_loc (MSG_NOTE, vect_location,
2002 "Loop contains only SLP stmts\n");
2003 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2005 else
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_NOTE, vect_location,
2009 "Loop contains SLP and non-SLP stmts\n");
2010 /* Both the vectorization factor and unroll factor have the form
2011 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2012 so they must have a common multiple. */
2013 vectorization_factor
2014 = force_common_multiple (vectorization_factor,
2015 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2018 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2019 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE, vect_location,
2022 "Updating vectorization factor to ");
2023 dump_dec (MSG_NOTE, vectorization_factor);
2024 dump_printf (MSG_NOTE, ".\n");
2028 /* Return true if STMT_INFO describes a double reduction phi and if
2029 the other phi in the reduction is also relevant for vectorization.
2030 This rejects cases such as:
2032 outer1:
2033 x_1 = PHI <x_3(outer2), ...>;
2036 inner:
2037 x_2 = ...;
2040 outer2:
2041 x_3 = PHI <x_2(inner)>;
2043 if nothing in x_2 or elsewhere makes x_1 relevant. */
2045 static bool
2046 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2048 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2049 return false;
2051 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2054 /* Function vect_analyze_loop_operations.
2056 Scan the loop stmts and make sure they are all vectorizable. */
2058 static opt_result
2059 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2061 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2062 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2063 int nbbs = loop->num_nodes;
2064 int i;
2065 stmt_vec_info stmt_info;
2066 bool need_to_vectorize = false;
2067 bool ok;
2069 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2071 auto_vec<stmt_info_for_cost> cost_vec;
2073 for (i = 0; i < nbbs; i++)
2075 basic_block bb = bbs[i];
2077 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2078 gsi_next (&si))
2080 gphi *phi = si.phi ();
2081 ok = true;
2083 stmt_info = loop_vinfo->lookup_stmt (phi);
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2086 (gimple *) phi);
2087 if (virtual_operand_p (gimple_phi_result (phi)))
2088 continue;
2090 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2091 (i.e., a phi in the tail of the outer-loop). */
2092 if (! is_loop_header_bb_p (bb))
2094 /* FORNOW: we currently don't support the case that these phis
2095 are not used in the outer loop (unless it is a double reduction,
2096 i.e., this phi is vect_reduction_def), because this case
2097 requires us to actually do something here.
2098 if (STMT_VINFO_LIVE_P (stmt_info)
2099 && !vect_active_double_reduction_p (stmt_info))
2100 return opt_result::failure_at (phi,
2101 "Unsupported loop-closed phi"
2102 " in outer-loop.\n");
2104 /* If PHI is used in the outer loop, we check that its operand
2105 is defined in the inner loop. */
2106 if (STMT_VINFO_RELEVANT_P (stmt_info))
2108 tree phi_op;
2110 if (gimple_phi_num_args (phi) != 1)
2111 return opt_result::failure_at (phi, "unsupported phi");
2113 phi_op = PHI_ARG_DEF (phi, 0);
2114 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2115 if (!op_def_info)
2116 return opt_result::failure_at (phi, "unsupported phi\n");
2118 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2119 && (STMT_VINFO_RELEVANT (op_def_info)
2120 != vect_used_in_outer_by_reduction))
2121 return opt_result::failure_at (phi, "unsupported phi\n");
2123 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2124 || (STMT_VINFO_DEF_TYPE (stmt_info)
2125 == vect_double_reduction_def))
2126 && !vectorizable_lc_phi (loop_vinfo,
2127 stmt_info, NULL, NULL))
2128 return opt_result::failure_at (phi, "unsupported phi\n");
2131 continue;
2134 gcc_assert (stmt_info);
2136 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2137 || STMT_VINFO_LIVE_P (stmt_info))
2138 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2139 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2140 /* A scalar-dependence cycle that we don't support. */
2141 return opt_result::failure_at (phi,
2142 "not vectorized:"
2143 " scalar dependence cycle.\n");
2145 if (STMT_VINFO_RELEVANT_P (stmt_info))
2147 need_to_vectorize = true;
2148 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2149 && ! PURE_SLP_STMT (stmt_info))
2150 ok = vectorizable_induction (loop_vinfo,
2151 stmt_info, NULL, NULL,
2152 &cost_vec);
2153 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2154 || (STMT_VINFO_DEF_TYPE (stmt_info)
2155 == vect_double_reduction_def)
2156 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2157 && ! PURE_SLP_STMT (stmt_info))
2158 ok = vectorizable_reduction (loop_vinfo,
2159 stmt_info, NULL, NULL, &cost_vec);
2160 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2161 == vect_first_order_recurrence)
2162 && ! PURE_SLP_STMT (stmt_info))
2163 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2164 &cost_vec);
2167 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2168 if (ok
2169 && STMT_VINFO_LIVE_P (stmt_info)
2170 && !PURE_SLP_STMT (stmt_info))
2171 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2172 -1, false, &cost_vec);
2174 if (!ok)
2175 return opt_result::failure_at (phi,
2176 "not vectorized: relevant phi not "
2177 "supported: %G",
2178 static_cast <gimple *> (phi));
2181 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2182 gsi_next (&si))
2184 gimple *stmt = gsi_stmt (si);
2185 if (!gimple_clobber_p (stmt)
2186 && !is_gimple_debug (stmt))
2188 opt_result res
2189 = vect_analyze_stmt (loop_vinfo,
2190 loop_vinfo->lookup_stmt (stmt),
2191 &need_to_vectorize,
2192 NULL, NULL, &cost_vec);
2193 if (!res)
2194 return res;
2197 } /* bbs */
2199 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2201 /* All operations in the loop are either irrelevant (they deal with loop
2202 control, or are dead), or only used outside the loop and can be moved
2203 out of the loop (e.g. invariants, inductions). The loop can be
2204 optimized away by scalar optimizations. We're better off not
2205 touching this loop. */
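/* For illustration only (hypothetical names, not from the original
   sources): a loop such as

       for (i = 0; i < n; i++)
         last = i + 1;

   contains only an induction whose final value is used after the loop, so
   there is nothing to vectorize and final value replacement can remove the
   loop entirely.  */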
2206 if (!need_to_vectorize)
2208 if (dump_enabled_p ())
2209 dump_printf_loc (MSG_NOTE, vect_location,
2210 "All the computation can be taken out of the loop.\n");
2211 return opt_result::failure_at
2212 (vect_location,
2213 "not vectorized: redundant loop. no profit to vectorize.\n");
2216 return opt_result::success ();
2219 /* Return true if we know that the iteration count is smaller than the
2220 vectorization factor. Return false if it isn't, or if we can't be sure
2221 either way. */
2223 static bool
2224 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2226 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2228 HOST_WIDE_INT max_niter;
2229 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2230 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2231 else
2232 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2234 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2235 return true;
2237 return false;
2240 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2241 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2242 definitely no, or -1 if it's worth retrying. */
2244 static int
2245 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2246 unsigned *suggested_unroll_factor)
2248 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2249 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2251 /* Only loops that can handle partially-populated vectors can have iteration
2252 counts less than the vectorization factor. */
2253 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2254 && vect_known_niters_smaller_than_vf (loop_vinfo))
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2258 "not vectorized: iteration count smaller than "
2259 "vectorization factor.\n");
2260 return 0;
2263 /* If we know the number of iterations we can do better, for the
2264 epilogue we can also decide whether the main loop leaves us
2265 with enough iterations, preferring a smaller vector epilogue that is
2266 then also possibly used for the case in which we skip the vector loop. */
2267 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2269 widest_int scalar_niters
2270 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2271 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2273 loop_vec_info orig_loop_vinfo
2274 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2275 unsigned lowest_vf
2276 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2277 int prolog_peeling = 0;
2278 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2279 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2280 if (prolog_peeling >= 0
2281 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2282 lowest_vf))
2284 unsigned gap
2285 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2286 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2287 % lowest_vf + gap);
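/* A worked example (illustrative only): with 100 scalar iterations, a main
   loop VF of 16, prologue peeling of 3 iterations and no peeling for gaps,
   the epilogue is left with (100 - 0 - 3) % 16 + 0 = 1 scalar iteration,
   which the single-iteration check below then rejects.  */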
2290 /* Reject vectorizing for a single scalar iteration, even if
2291 we could in principle implement that using partial vectors. */
2292 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2293 if (scalar_niters <= peeling_gap + 1)
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2297 "not vectorized: loop only has a single "
2298 "scalar iteration.\n");
2299 return 0;
2302 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2304 /* Check that the loop processes at least one full vector. */
2305 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2306 if (known_lt (scalar_niters, vf))
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2310 "loop does not have enough iterations "
2311 "to support vectorization.\n");
2312 return 0;
2315 /* If we need to peel an extra epilogue iteration to handle data
2316 accesses with gaps, check that there are enough scalar iterations
2317 available.
2319 The check above is redundant with this one when peeling for gaps,
2320 but the distinction is useful for diagnostics. */
2321 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2322 && known_le (scalar_niters, vf))
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "loop does not have enough iterations "
2327 "to support peeling for gaps.\n");
2328 return 0;
2333 /* If using the "very cheap" model, reject cases in which we'd keep
2334 a copy of the scalar code (even if we might be able to vectorize it). */
2335 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2336 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2337 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2338 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2340 if (dump_enabled_p ())
2341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2342 "some scalar iterations would need to be peeled\n");
2343 return 0;
2346 int min_profitable_iters, min_profitable_estimate;
2347 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2348 &min_profitable_estimate,
2349 suggested_unroll_factor);
2351 if (min_profitable_iters < 0)
2353 if (dump_enabled_p ())
2354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2355 "not vectorized: vectorization not profitable.\n");
2356 if (dump_enabled_p ())
2357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2358 "not vectorized: vector version will never be "
2359 "profitable.\n");
2360 return -1;
2363 int min_scalar_loop_bound = (param_min_vect_loop_bound
2364 * assumed_vf);
2366 /* Use the cost model only if it is more conservative than the
2367 user-specified threshold. */
2368 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2369 min_profitable_iters);
2371 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2373 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2374 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2376 if (dump_enabled_p ())
2377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2378 "not vectorized: vectorization not profitable.\n");
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_NOTE, vect_location,
2381 "not vectorized: iteration count smaller than user "
2382 "specified loop bound parameter or minimum profitable "
2383 "iterations (whichever is more conservative).\n");
2384 return 0;
2387 /* The static profitability threshold min_profitable_estimate includes
2388 the cost of having to check at runtime whether the scalar loop
2389 should be used instead. If it turns out that we don't need or want
2390 such a check, the threshold we should use for the static estimate
2391 is simply the point at which the vector loop becomes more profitable
2392 than the scalar loop. */
2393 if (min_profitable_estimate > min_profitable_iters
2394 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2395 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2396 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2397 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2401 " choice between the scalar and vector loops\n");
2402 min_profitable_estimate = min_profitable_iters;
2405 /* If the vector loop needs multiple iterations to be beneficial then
2406 things are probably too close to call, and the conservative thing
2407 would be to stick with the scalar code. */
2408 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2409 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2411 if (dump_enabled_p ())
2412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2413 "one iteration of the vector loop would be"
2414 " more expensive than the equivalent number of"
2415 " iterations of the scalar loop\n");
2416 return 0;
2419 HOST_WIDE_INT estimated_niter;
2421 /* If we are vectorizing an epilogue then we know the maximum number of
2422 scalar iterations it will cover is at least one lower than the
2423 vectorization factor of the main loop. */
2424 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2425 estimated_niter
2426 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2427 else
2429 estimated_niter = estimated_stmt_executions_int (loop);
2430 if (estimated_niter == -1)
2431 estimated_niter = likely_max_stmt_executions_int (loop);
2433 if (estimated_niter != -1
2434 && ((unsigned HOST_WIDE_INT) estimated_niter
2435 < MAX (th, (unsigned) min_profitable_estimate)))
2437 if (dump_enabled_p ())
2438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2439 "not vectorized: estimated iteration count too "
2440 "small.\n");
2441 if (dump_enabled_p ())
2442 dump_printf_loc (MSG_NOTE, vect_location,
2443 "not vectorized: estimated iteration count smaller "
2444 "than specified loop bound parameter or minimum "
2445 "profitable iterations (whichever is more "
2446 "conservative).\n");
2447 return -1;
2450 return 1;
2453 static opt_result
2454 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2455 vec<data_reference_p> *datarefs,
2456 unsigned int *n_stmts)
2458 *n_stmts = 0;
2459 for (unsigned i = 0; i < loop->num_nodes; i++)
2460 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2461 !gsi_end_p (gsi); gsi_next (&gsi))
2463 gimple *stmt = gsi_stmt (gsi);
2464 if (is_gimple_debug (stmt))
2465 continue;
2466 ++(*n_stmts);
2467 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2468 NULL, 0);
2469 if (!res)
2471 if (is_gimple_call (stmt) && loop->safelen)
2473 tree fndecl = gimple_call_fndecl (stmt), op;
2474 if (fndecl == NULL_TREE
2475 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2477 fndecl = gimple_call_arg (stmt, 0);
2478 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2479 fndecl = TREE_OPERAND (fndecl, 0);
2480 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2482 if (fndecl != NULL_TREE)
2484 cgraph_node *node = cgraph_node::get (fndecl);
2485 if (node != NULL && node->simd_clones != NULL)
2487 unsigned int j, n = gimple_call_num_args (stmt);
2488 for (j = 0; j < n; j++)
2490 op = gimple_call_arg (stmt, j);
2491 if (DECL_P (op)
2492 || (REFERENCE_CLASS_P (op)
2493 && get_base_address (op)))
2494 break;
2496 op = gimple_call_lhs (stmt);
2497 /* Ignore #pragma omp declare simd functions
2498 if they don't have data references in the
2499 call stmt itself. */
2500 if (j == n
2501 && !(op
2502 && (DECL_P (op)
2503 || (REFERENCE_CLASS_P (op)
2504 && get_base_address (op)))))
2505 continue;
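/* An illustrative case (hypothetical declarations, not from the original
   sources): in a loop like

       #pragma omp declare simd
       extern int f (int);

       #pragma omp simd
       for (int i = 0; i < n; i++)
         a[i] = f (a[i]);

   the call to f carries no memory reference in the call statement itself
   (its argument and result are SSA names), so the otherwise unanalyzable
   call does not stop data-reference analysis here.  */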
2509 return res;
2511 /* If dependence analysis will give up due to the limit on the
2512 number of datarefs, stop here and fail fatally. */
2513 if (datarefs->length ()
2514 > (unsigned)param_loop_max_datarefs_for_datadeps)
2515 return opt_result::failure_at (stmt, "exceeded param "
2516 "loop-max-datarefs-for-datadeps\n");
2518 return opt_result::success ();
2521 /* Look for SLP-only access groups and turn each individual access into its own
2522 group. */
2523 static void
2524 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2526 unsigned int i;
2527 struct data_reference *dr;
2529 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2531 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2532 FOR_EACH_VEC_ELT (datarefs, i, dr)
2534 gcc_assert (DR_REF (dr));
2535 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2537 /* Check if the load is a part of an interleaving chain. */
2538 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2540 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2541 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2542 unsigned int group_size = DR_GROUP_SIZE (first_element);
2544 /* Check if this is an SLP-only group. */
2545 if (!STMT_SLP_TYPE (stmt_info)
2546 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2548 /* Dissolve the group. */
2549 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2551 stmt_vec_info vinfo = first_element;
2552 while (vinfo)
2554 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2555 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2556 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2557 DR_GROUP_SIZE (vinfo) = 1;
2558 if (STMT_VINFO_STRIDED_P (first_element)
2559 /* We cannot handle stores with gaps. */
2560 || DR_IS_WRITE (dr_info->dr))
2562 STMT_VINFO_STRIDED_P (vinfo) = true;
2563 DR_GROUP_GAP (vinfo) = 0;
2565 else
2566 DR_GROUP_GAP (vinfo) = group_size - 1;
2567 /* Duplicate and adjust alignment info; it needs to
2568 be present on each group leader, see dr_misalignment. */
2569 if (vinfo != first_element)
2571 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2572 dr_info2->target_alignment = dr_info->target_alignment;
2573 int misalignment = dr_info->misalignment;
2574 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2576 HOST_WIDE_INT diff
2577 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2578 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2579 unsigned HOST_WIDE_INT align_c
2580 = dr_info->target_alignment.to_constant ();
2581 misalignment = (misalignment + diff) % align_c;
2583 dr_info2->misalignment = misalignment;
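/* A worked example (illustrative only): if the group leader has a target
   alignment of 16 bytes and a known misalignment of 4, and this element
   starts 8 bytes further into the object, its misalignment becomes
   (4 + 8) % 16 = 12 after the adjustment above.  */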
2585 vinfo = next;
2592 /* Determine if operating on full vectors for LOOP_VINFO might leave
2593 some scalar iterations still to do. If so, decide how we should
2594 handle those scalar iterations. The possibilities are:
2596 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2597 In this case:
2599 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2600 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2601 LOOP_VINFO_PEELING_FOR_NITER == false
2603 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2604 to handle the remaining scalar iterations. In this case:
2606 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2607 LOOP_VINFO_PEELING_FOR_NITER == true
2609 There are two choices:
2611 (2a) Consider vectorizing the epilogue loop at the same VF as the
2612 main loop, but using partial vectors instead of full vectors.
2613 In this case:
2615 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2617 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2618 In this case:
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
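   As an illustrative example (not part of the original comment): for a loop
   with 1000 scalar iterations and a vectorization factor of 16, option (1)
   runs 63 partial-vector iterations with no epilogue, while option (2) runs
   62 full-vector iterations and leaves 1000 - 62 * 16 = 8 scalar iterations
   to an epilogue, which may itself be vectorized as in (2a) or (2b).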
2623 opt_result
2624 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2626 /* Determine whether there would be any scalar iterations left over. */
2627 bool need_peeling_or_partial_vectors_p
2628 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2630 /* Decide whether to vectorize the loop with partial vectors. */
2631 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2632 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2633 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2634 && need_peeling_or_partial_vectors_p)
2636 /* For partial-vector-usage=1, try to push the handling of partial
2637 vectors to the epilogue, with the main loop continuing to operate
2638 on full vectors.
2640 If we are unrolling we also do not want to use partial vectors. This
2641 is to avoid the overhead of generating multiple masks and also to
2642 avoid having to execute entire iterations of FALSE masked instructions
2643 when dealing with one or fewer full iterations.
2645 ??? We could then end up failing to use partial vectors if we
2646 decide to peel iterations into a prologue, and if the main loop
2647 then ends up processing fewer than VF iterations. */
2648 if ((param_vect_partial_vector_usage == 1
2649 || loop_vinfo->suggested_unroll_factor > 1)
2650 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2651 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2652 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2653 else
2654 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2657 if (dump_enabled_p ())
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "operating on %s vectors%s.\n",
2660 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2661 ? "partial" : "full",
2662 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2663 ? " for epilogue loop" : "");
2665 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2666 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2667 && need_peeling_or_partial_vectors_p);
2669 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
2670 analysis, when we don't yet know whether the loop will be vectorized
2671 with partial vectors (for more details see tree-vect-loop-manip.cc).
2673 However, the SELECT_VL vectorization style should only be applied to
2674 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2675 the number of elements to be processed in each iteration.
2677 After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2678 if the loop is not vectorized with partial vectors. */
2679 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2680 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2682 return opt_result::success ();
2685 /* Function vect_analyze_loop_2.
2687 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2688 analyses will record information in some members of LOOP_VINFO. FATAL
2689 indicates whether some analysis hit a fatal error. If a non-NULL pointer
2690 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
2691 worked-out suggested unroll factor, while a NULL pointer indicates that
2692 the suggested unroll factor is being applied. SLP_DONE_FOR_SUGGESTED_UF
2693 holds the SLP decision made when the suggested unroll factor was worked
2694 out. */
2695 static opt_result
2696 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2697 unsigned *suggested_unroll_factor,
2698 bool& slp_done_for_suggested_uf)
2700 opt_result ok = opt_result::success ();
2701 int res;
2702 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2703 poly_uint64 min_vf = 2;
2704 loop_vec_info orig_loop_vinfo = NULL;
2706 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2707 loop_vec_info of the first vectorized loop. */
2708 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2709 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2710 else
2711 orig_loop_vinfo = loop_vinfo;
2712 gcc_assert (orig_loop_vinfo);
2714 /* The first group of checks is independent of the vector size. */
2715 fatal = true;
2717 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2718 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2719 return opt_result::failure_at (vect_location,
2720 "not vectorized: simd if(0)\n");
2722 /* Find all data references in the loop (which correspond to vdefs/vuses)
2723 and analyze their evolution in the loop. */
2725 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2727 /* Gather the data references and count stmts in the loop. */
2728 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2730 opt_result res
2731 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2732 &LOOP_VINFO_DATAREFS (loop_vinfo),
2733 &LOOP_VINFO_N_STMTS (loop_vinfo));
2734 if (!res)
2736 if (dump_enabled_p ())
2737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2738 "not vectorized: loop contains function "
2739 "calls or data references that cannot "
2740 "be analyzed\n");
2741 return res;
2743 loop_vinfo->shared->save_datarefs ();
2745 else
2746 loop_vinfo->shared->check_datarefs ();
2748 /* Analyze the data references and also adjust the minimal
2749 vectorization factor according to the loads and stores. */
2751 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2752 if (!ok)
2754 if (dump_enabled_p ())
2755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2756 "bad data references.\n");
2757 return ok;
2760 /* Check if we are applying unroll factor now. */
2761 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2762 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2764 /* If the SLP decision was false when the suggested unroll factor was
2765 worked out, and we are now applying that suggested unroll factor, we
2766 can simply skip all SLP-related analyses this time. */
2767 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2769 /* Classify all cross-iteration scalar data-flow cycles.
2770 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2771 vect_analyze_scalar_cycles (loop_vinfo, slp);
2773 vect_pattern_recog (loop_vinfo);
2775 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2777 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2778 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2780 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2781 if (!ok)
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2785 "bad data access.\n");
2786 return ok;
2789 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2791 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2792 if (!ok)
2794 if (dump_enabled_p ())
2795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2796 "unexpected pattern.\n");
2797 return ok;
2800 /* The rest of the analysis below depends on this in some way. */
2801 fatal = false;
2803 /* Analyze data dependences between the data-refs in the loop
2804 and adjust the maximum vectorization factor according to
2805 the dependences.
2806 FORNOW: fail at the first data dependence that we encounter. */
2808 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2809 if (!ok)
2811 if (dump_enabled_p ())
2812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2813 "bad data dependence.\n");
2814 return ok;
2816 if (max_vf != MAX_VECTORIZATION_FACTOR
2817 && maybe_lt (max_vf, min_vf))
2818 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2819 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2821 ok = vect_determine_vectorization_factor (loop_vinfo);
2822 if (!ok)
2824 if (dump_enabled_p ())
2825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2826 "can't determine vectorization factor.\n");
2827 return ok;
2830 /* Compute the scalar iteration cost. */
2831 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2833 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2835 if (slp)
2837 /* Check the SLP opportunities in the loop, analyze and build
2838 SLP trees. */
2839 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2840 if (!ok)
2841 return ok;
2843 /* If there are any SLP instances mark them as pure_slp. */
2844 slp = vect_make_slp_decision (loop_vinfo);
2845 if (slp)
2847 /* Find stmts that need to be both vectorized and SLPed. */
2848 vect_detect_hybrid_slp (loop_vinfo);
2850 /* Update the vectorization factor based on the SLP decision. */
2851 vect_update_vf_for_slp (loop_vinfo);
2853 /* Optimize the SLP graph with the vectorization factor fixed. */
2854 vect_optimize_slp (loop_vinfo);
2856 /* Gather the loads reachable from the SLP graph entries. */
2857 vect_gather_slp_loads (loop_vinfo);
2861 bool saved_can_use_partial_vectors_p
2862 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2864 /* We don't expect to have to roll back to anything other than an empty
2865 set of rgroups. */
2866 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2868 /* This is the point where we can re-start analysis with SLP forced off. */
2869 start_over:
2871 /* Apply the suggested unrolling factor; this was determined by the backend
2872 during finish_cost the first time we ran the analysis for this
2873 vector mode. */
2874 if (applying_suggested_uf)
2875 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2877 /* Now the vectorization factor is final. */
2878 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2879 gcc_assert (known_ne (vectorization_factor, 0U));
2881 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2883 dump_printf_loc (MSG_NOTE, vect_location,
2884 "vectorization_factor = ");
2885 dump_dec (MSG_NOTE, vectorization_factor);
2886 dump_printf (MSG_NOTE, ", niters = %wd\n",
2887 LOOP_VINFO_INT_NITERS (loop_vinfo));
2890 if (max_vf != MAX_VECTORIZATION_FACTOR
2891 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2892 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2894 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2896 /* Analyze the alignment of the data-refs in the loop.
2897 Fail if a data reference is found that cannot be vectorized. */
2899 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2900 if (!ok)
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2904 "bad data alignment.\n");
2905 return ok;
2908 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2909 It is important to call pruning after vect_analyze_data_ref_accesses,
2910 since we use grouping information gathered by interleaving analysis. */
2911 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2912 if (!ok)
2913 return ok;
2915 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2916 vectorization, since we do not want to add extra peeling or
2917 add versioning for alignment. */
2918 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2919 /* This pass will decide on using loop versioning and/or loop peeling in
2920 order to enhance the alignment of data references in the loop. */
2921 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2922 if (!ok)
2923 return ok;
2925 if (slp)
2927 /* Analyze operations in the SLP instances. Note this may
2928 remove unsupported SLP instances which makes the above
2929 SLP kind detection invalid. */
2930 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2931 vect_slp_analyze_operations (loop_vinfo);
2932 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2934 ok = opt_result::failure_at (vect_location,
2935 "unsupported SLP instances\n");
2936 goto again;
2939 /* Check whether any load in ALL SLP instances is possibly permuted. */
2940 slp_tree load_node, slp_root;
2941 unsigned i, x;
2942 slp_instance instance;
2943 bool can_use_lanes = true;
2944 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2946 slp_root = SLP_INSTANCE_TREE (instance);
2947 int group_size = SLP_TREE_LANES (slp_root);
2948 tree vectype = SLP_TREE_VECTYPE (slp_root);
2949 bool loads_permuted = false;
2950 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2952 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2953 continue;
2954 unsigned j;
2955 stmt_vec_info load_info;
2956 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2957 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2959 loads_permuted = true;
2960 break;
2964 /* If the loads and stores can be handled with load/store-lane
2965 instructions record it and move on to the next instance. */
2966 if (loads_permuted
2967 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2968 && vect_store_lanes_supported (vectype, group_size, false)
2969 != IFN_LAST)
2971 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2972 if (STMT_VINFO_GROUPED_ACCESS
2973 (SLP_TREE_REPRESENTATIVE (load_node)))
2975 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2976 (SLP_TREE_REPRESENTATIVE (load_node));
2977 /* Use SLP for strided accesses (or if we can't
2978 use load-lanes). */
2979 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2980 || vect_load_lanes_supported
2981 (STMT_VINFO_VECTYPE (stmt_vinfo),
2982 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2983 break;
2986 can_use_lanes
2987 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2989 if (can_use_lanes && dump_enabled_p ())
2990 dump_printf_loc (MSG_NOTE, vect_location,
2991 "SLP instance %p can use load/store-lanes\n",
2992 (void *) instance);
2994 else
2996 can_use_lanes = false;
2997 break;
3001 /* If all SLP instances can use load/store-lanes abort SLP and try again
3002 with SLP disabled. */
3003 if (can_use_lanes)
3005 ok = opt_result::failure_at (vect_location,
3006 "Built SLP cancelled: can use "
3007 "load/store-lanes\n");
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010 "Built SLP cancelled: all SLP instances support "
3011 "load/store-lanes\n");
3012 goto again;
3016 /* Dissolve SLP-only groups. */
3017 vect_dissolve_slp_only_groups (loop_vinfo);
3019 /* Scan all the remaining operations in the loop that are not subject
3020 to SLP and make sure they are vectorizable. */
3021 ok = vect_analyze_loop_operations (loop_vinfo);
3022 if (!ok)
3024 if (dump_enabled_p ())
3025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3026 "bad operation or unsupported loop bound.\n");
3027 return ok;
3030 /* For now, we don't expect to mix both masking and length approaches for one
3031 loop; disable partial vectors if both are recorded. */
3032 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3033 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3034 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3036 if (dump_enabled_p ())
3037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3038 "can't vectorize a loop with partial vectors"
3039 " because we don't expect to mix different"
3040 " approaches with partial vectors for the"
3041 " same loop.\n");
3042 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3045 /* If we still have the option of using partial vectors,
3046 check whether we can generate the necessary loop controls. */
3047 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3049 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3051 if (!vect_verify_full_masking (loop_vinfo)
3052 && !vect_verify_full_masking_avx512 (loop_vinfo))
3053 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3055 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3056 if (!vect_verify_loop_lens (loop_vinfo))
3057 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3060 /* If we're vectorizing a loop that uses length "controls" and
3061 can iterate more than once, we apply the decrementing IV approach
3062 in the loop control. */
3063 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3064 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3065 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3066 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3067 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3068 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3069 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3071 /* If a loop uses length controls and has a decrementing loop control IV,
3072 we will normally pass that IV through a MIN_EXPR to calculate the
3073 basis for the length controls. E.g. in a loop that processes one
3074 element per scalar iteration, the number of elements would be
3075 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3077 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3078 step, since only the final iteration of the vector loop can have
3079 inactive lanes.
3081 However, some targets have a dedicated instruction for calculating the
3082 preferred length, given the total number of elements that still need to
3083 be processed. This is encapsulated in the SELECT_VL internal function.
3085 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3086 to determine the basis for the length controls. However, unlike the
3087 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3088 lanes inactive in any iteration of the vector loop, not just the last
3089 iteration. This SELECT_VL approach therefore requires us to use pointer
3090 IVs with variable steps.
3092 Once we've decided how many elements should be processed by one
3093 iteration of the vector loop, we need to populate the rgroup controls.
3094 If a loop has multiple rgroups, we need to make sure that those rgroups
3095 "line up" (that is, they must be consistent about which elements are
3096 active and which aren't). This is done by vect_adjust_loop_lens_control.
3098 In principle, it would be possible to use vect_adjust_loop_lens_control
3099 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3100 However:
3102 (1) In practice, it only makes sense to use SELECT_VL when a vector
3103 operation will be controlled directly by the result. It is not
3104 worth using SELECT_VL if it would only be the input to other
3105 calculations.
3107 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3108 pointer IV will need N updates by a variable amount (N-1 updates
3109 within the iteration and 1 update to move to the next iteration).
3111 Because of this, we prefer to use the MIN_EXPR approach whenever there
3112 is more than one length control.
3114 In addition, SELECT_VL always operates to a granularity of 1 unit.
3115 If we wanted to use it to control an SLP operation on N consecutive
3116 elements, we would need to make the SELECT_VL inputs measure scalar
3117 iterations (rather than elements) and then multiply the SELECT_VL
3118 result by N. But using SELECT_VL this way is inefficient because
3119 of (1) above.
3121 Finally, we don't apply SELECT_VL on a single rgroup when both (1) and
3122 (2) are satisfied:
3124 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3125 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3127 Since SELECT_VL (with its variable step) would make SCEV analysis fail,
3128 we would then lose the benefits of the subsequent unroll optimizations.
3129 We prefer using the MIN_EXPR approach in this situation. */
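/* A hedged sketch (not part of the original comment) of the two styles in
   scalar pseudo-code; N, VF, len and process are hypothetical names.

   MIN_EXPR style - only the final iteration can be partial, so IVs can
   advance by the invariant step VF:

       for (i = 0; i < N; i += VF)
         {
           len = MIN (N - i, VF);
           process (i, len);
         }

   SELECT_VL style - the target may choose len < VF in any iteration, so
   IVs must advance by the variable len:

       for (i = 0; i < N; i += len)
         {
           len = SELECT_VL (N - i, VF);
           process (i, len);
         }  */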
3130 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3132 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3133 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3134 OPTIMIZE_FOR_SPEED)
3135 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3136 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3137 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3138 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3139 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3142 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3143 assuming that the loop will be used as a main loop. We will redo
3144 this analysis later if we instead decide to use the loop as an
3145 epilogue loop. */
3146 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3147 if (!ok)
3148 return ok;
3150 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3151 to be able to handle fewer than VF scalars, or needs to have a lower VF
3152 than the main loop. */
3153 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3154 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3156 poly_uint64 unscaled_vf
3157 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3158 orig_loop_vinfo->suggested_unroll_factor);
3159 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3160 return opt_result::failure_at (vect_location,
3161 "Vectorization factor too high for"
3162 " epilogue loop.\n");
3165 /* Check the costings of the loop make vectorizing worthwhile. */
3166 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3167 if (res < 0)
3169 ok = opt_result::failure_at (vect_location,
3170 "Loop costings may not be worthwhile.\n");
3171 goto again;
3173 if (!res)
3174 return opt_result::failure_at (vect_location,
3175 "Loop costings not worthwhile.\n");
3177 /* If an epilogue loop is required make sure we can create one. */
3178 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3179 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3180 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3184 if (!vect_can_advance_ivs_p (loop_vinfo)
3185 || !slpeel_can_duplicate_loop_p (loop,
3186 LOOP_VINFO_IV_EXIT (loop_vinfo),
3187 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3189 ok = opt_result::failure_at (vect_location,
3190 "not vectorized: can't create required "
3191 "epilog loop\n");
3192 goto again;
3196 /* During peeling, we need to check if the number of loop iterations is
3197 enough for both the peeled prolog loop and the vector loop. This check
3198 can be merged along with threshold check of loop versioning, so
3199 increase threshold for this case if necessary.
3201 If we are analyzing an epilogue we still want to check what its
3202 versioning threshold would be. If we decide to vectorize the epilogues we
3203 will want to use the lowest versioning threshold of all epilogues and main
3204 loop. This will enable us to enter a vectorized epilogue even when
3205 versioning the loop. We can't simply check whether the epilogue requires
3206 versioning though since we may have skipped some versioning checks when
3207 analyzing the epilogue. For instance, checks for alias versioning will be
3208 skipped when dealing with epilogues as we assume we already checked them
3209 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3210 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3212 poly_uint64 niters_th = 0;
3213 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3215 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3217 /* Niters for peeled prolog loop. */
3218 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3220 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3221 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3222 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3224 else
3225 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3228 /* Niters for at least one iteration of vectorized loop. */
3229 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3230 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3231 /* One additional iteration because of peeling for gap. */
3232 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3233 niters_th += 1;
3235 /* Use the same condition as vect_transform_loop to decide when to use
3236 the cost to determine a versioning threshold. */
3237 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3238 && ordered_p (th, niters_th))
3239 niters_th = ordered_max (poly_uint64 (th), niters_th);
3241 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
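/* A worked example (illustrative only), assuming no loop masking for
   alignment and no partial vectors: with prologue peeling of 3 iterations,
   a vectorization factor of 16 and peeling for gaps, the code above gives
   niters_th = 3 + 16 + 1 = 20; when the runtime profitability check
   applies, the recorded threshold is the maximum of 20 and the cost-model
   threshold th.  */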
3244 gcc_assert (known_eq (vectorization_factor,
3245 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3247 slp_done_for_suggested_uf = slp;
3249 /* Ok to vectorize! */
3250 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3251 return opt_result::success ();
3253 again:
3254 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3255 gcc_assert (!ok);
3257 /* Try again with SLP forced off but if we didn't do any SLP there is
3258 no point in re-trying. */
3259 if (!slp)
3260 return ok;
3262 /* If the SLP decision was true when the suggested unroll factor was
3263 worked out, and we are now applying that suggested unroll factor, we
3264 don't need to retry any more. */
3265 if (applying_suggested_uf && slp_done_for_suggested_uf)
3266 return ok;
3268 /* If there are reduction chains re-trying will fail anyway. */
3269 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3270 return ok;
3272 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3273 via interleaving or lane instructions. */
3274 slp_instance instance;
3275 slp_tree node;
3276 unsigned i, j;
3277 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3279 stmt_vec_info vinfo;
3280 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3281 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3282 continue;
3283 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3284 unsigned int size = DR_GROUP_SIZE (vinfo);
3285 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3286 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3287 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3288 && ! vect_grouped_store_supported (vectype, size))
3289 return opt_result::failure_at (vinfo->stmt,
3290 "unsupported grouped store\n");
3291 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3293 vinfo = SLP_TREE_REPRESENTATIVE (node);
3294 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3296 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3297 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3298 size = DR_GROUP_SIZE (vinfo);
3299 vectype = STMT_VINFO_VECTYPE (vinfo);
3300 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3301 && ! vect_grouped_load_supported (vectype, single_element_p,
3302 size))
3303 return opt_result::failure_at (vinfo->stmt,
3304 "unsupported grouped load\n");
3309 if (dump_enabled_p ())
3310 dump_printf_loc (MSG_NOTE, vect_location,
3311 "re-trying with SLP disabled\n");
3313 /* Roll back state appropriately. No SLP this time. */
3314 slp = false;
3315 /* Restore vectorization factor as it were without SLP. */
3316 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3317 /* Free the SLP instances. */
3318 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3319 vect_free_slp_instance (instance);
3320 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3321 /* Reset SLP type to loop_vect on all stmts. */
3322 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3324 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3325 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3326 !gsi_end_p (si); gsi_next (&si))
3328 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3329 STMT_SLP_TYPE (stmt_info) = loop_vect;
3330 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3331 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3333 /* vectorizable_reduction adjusts reduction stmt def-types,
3334 restore them to that of the PHI. */
3335 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3336 = STMT_VINFO_DEF_TYPE (stmt_info);
3337 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3338 (STMT_VINFO_REDUC_DEF (stmt_info)))
3339 = STMT_VINFO_DEF_TYPE (stmt_info);
3342 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3343 !gsi_end_p (si); gsi_next (&si))
3345 if (is_gimple_debug (gsi_stmt (si)))
3346 continue;
3347 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3348 STMT_SLP_TYPE (stmt_info) = loop_vect;
3349 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3351 stmt_vec_info pattern_stmt_info
3352 = STMT_VINFO_RELATED_STMT (stmt_info);
3353 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3354 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3356 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3357 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3358 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3359 !gsi_end_p (pi); gsi_next (&pi))
3360 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3361 = loop_vect;
3365 /* Free optimized alias test DDRS. */
3366 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3367 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3368 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3369 /* Reset target cost data. */
3370 delete loop_vinfo->vector_costs;
3371 loop_vinfo->vector_costs = nullptr;
3372 /* Reset accumulated rgroup information. */
3373 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3374 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3375 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3376 /* Reset assorted flags. */
3377 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3378 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3379 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3380 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3381 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3382 = saved_can_use_partial_vectors_p;
3384 goto start_over;
3387 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3388 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3389 OLD_LOOP_VINFO is better unless something specifically indicates
3390 otherwise.
3392 Note that this deliberately isn't a partial order. */
3394 static bool
3395 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3396 loop_vec_info old_loop_vinfo)
3398 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3399 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3401 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3402 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3404 /* Always prefer a VF of loop->simdlen over any other VF. */
3405 if (loop->simdlen)
3407 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3408 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3409 if (new_simdlen_p != old_simdlen_p)
3410 return new_simdlen_p;
3413 const auto *old_costs = old_loop_vinfo->vector_costs;
3414 const auto *new_costs = new_loop_vinfo->vector_costs;
3415 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3416 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3418 return new_costs->better_main_loop_than_p (old_costs);
3421 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3422 true if we should. */
3424 static bool
3425 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3426 loop_vec_info old_loop_vinfo)
3428 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3429 return false;
3431 if (dump_enabled_p ())
3432 dump_printf_loc (MSG_NOTE, vect_location,
3433 "***** Preferring vector mode %s to vector mode %s\n",
3434 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3435 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3436 return true;
3439 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3440 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3441 MODE_I to the next mode useful to analyze.
3442 Return the loop_vinfo on success and wrapped null on failure. */
3444 static opt_loop_vec_info
3445 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3446 const vect_loop_form_info *loop_form_info,
3447 loop_vec_info main_loop_vinfo,
3448 const vector_modes &vector_modes, unsigned &mode_i,
3449 machine_mode &autodetected_vector_mode,
3450 bool &fatal)
3452 loop_vec_info loop_vinfo
3453 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3455 machine_mode vector_mode = vector_modes[mode_i];
3456 loop_vinfo->vector_mode = vector_mode;
3457 unsigned int suggested_unroll_factor = 1;
3458 bool slp_done_for_suggested_uf = false;
3460 /* Run the main analysis. */
3461 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3462 &suggested_unroll_factor,
3463 slp_done_for_suggested_uf);
3464 if (dump_enabled_p ())
3465 dump_printf_loc (MSG_NOTE, vect_location,
3466 "***** Analysis %s with vector mode %s\n",
3467 res ? "succeeded" : "failed",
3468 GET_MODE_NAME (loop_vinfo->vector_mode));
3470 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3472 if (dump_enabled_p ())
3473 dump_printf_loc (MSG_NOTE, vect_location,
3474 "***** Re-trying analysis for unrolling"
3475 " with unroll factor %d and slp %s.\n",
3476 suggested_unroll_factor,
3477 slp_done_for_suggested_uf ? "on" : "off");
3478 loop_vec_info unroll_vinfo
3479 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3480 unroll_vinfo->vector_mode = vector_mode;
3481 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3482 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3483 slp_done_for_suggested_uf);
3484 if (new_res)
3486 delete loop_vinfo;
3487 loop_vinfo = unroll_vinfo;
3489 else
3490 delete unroll_vinfo;
3493 /* Remember the autodetected vector mode. */
3494 if (vector_mode == VOIDmode)
3495 autodetected_vector_mode = loop_vinfo->vector_mode;
3497 /* Advance mode_i, first skipping modes that would result in the
3498 same analysis result. */
3499 while (mode_i + 1 < vector_modes.length ()
3500 && vect_chooses_same_modes_p (loop_vinfo,
3501 vector_modes[mode_i + 1]))
3503 if (dump_enabled_p ())
3504 dump_printf_loc (MSG_NOTE, vect_location,
3505 "***** The result for vector mode %s would"
3506 " be the same\n",
3507 GET_MODE_NAME (vector_modes[mode_i + 1]));
3508 mode_i += 1;
3510 if (mode_i + 1 < vector_modes.length ()
3511 && VECTOR_MODE_P (autodetected_vector_mode)
3512 && (related_vector_mode (vector_modes[mode_i + 1],
3513 GET_MODE_INNER (autodetected_vector_mode))
3514 == autodetected_vector_mode)
3515 && (related_vector_mode (autodetected_vector_mode,
3516 GET_MODE_INNER (vector_modes[mode_i + 1]))
3517 == vector_modes[mode_i + 1]))
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 "***** Skipping vector mode %s, which would"
3522 " repeat the analysis for %s\n",
3523 GET_MODE_NAME (vector_modes[mode_i + 1]),
3524 GET_MODE_NAME (autodetected_vector_mode));
3525 mode_i += 1;
3527 mode_i++;
3529 if (!res)
3531 delete loop_vinfo;
3532 if (fatal)
3533 gcc_checking_assert (main_loop_vinfo == NULL);
3534 return opt_loop_vec_info::propagate_failure (res);
3537 return opt_loop_vec_info::success (loop_vinfo);
3540 /* Function vect_analyze_loop.
3542 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3543 for it. The different analyses will record information in the
3544 loop_vec_info struct. */
3545 opt_loop_vec_info
3546 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3548 DUMP_VECT_SCOPE ("analyze_loop_nest");
3550 if (loop_outer (loop)
3551 && loop_vec_info_for_loop (loop_outer (loop))
3552 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3553 return opt_loop_vec_info::failure_at (vect_location,
3554 "outer-loop already vectorized.\n");
3556 if (!find_loop_nest (loop, &shared->loop_nest))
3557 return opt_loop_vec_info::failure_at
3558 (vect_location,
3559 "not vectorized: loop nest containing two or more consecutive inner"
3560 " loops cannot be vectorized\n");
3562 /* Analyze the loop form. */
3563 vect_loop_form_info loop_form_info;
3564 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3565 if (!res)
3567 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3569 "bad loop form.\n");
3570 return opt_loop_vec_info::propagate_failure (res);
3572 if (!integer_onep (loop_form_info.assumptions))
3574 /* We consider to vectorize this loop by versioning it under
3575 some assumptions. In order to do this, we need to clear
3576 existing information computed by scev and niter analyzer. */
3577 scev_reset_htab ();
3578 free_numbers_of_iterations_estimates (loop);
3579 /* Also set a flag for this loop so that the following scev and niter
3580 analyses are done under the assumptions. */
3581 loop_constraint_set (loop, LOOP_C_FINITE);
3583 else
3584 /* Clear the existing niter information to make sure the nonwrapping flag
3585 will be calculated and set properly. */
3586 free_numbers_of_iterations_estimates (loop);
3588 auto_vector_modes vector_modes;
3589 /* Autodetect first vector size we try. */
3590 vector_modes.safe_push (VOIDmode);
3591 unsigned int autovec_flags
3592 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3593 loop->simdlen != 0);
3594 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3595 && !unlimited_cost_model (loop));
3596 machine_mode autodetected_vector_mode = VOIDmode;
3597 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3598 unsigned int mode_i = 0;
3599 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3601 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3602 a mode has not been analyzed. */
3603 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3604 for (unsigned i = 0; i < vector_modes.length (); ++i)
3605 cached_vf_per_mode.safe_push (0);
3607 /* First determine the main loop vectorization mode, either the first
3608 one that works, starting with auto-detecting the vector mode and then
3609 following the targets order of preference, or the one with the
3610 lowest cost if pick_lowest_cost_p. */
3611 while (1)
3613 bool fatal;
3614 unsigned int last_mode_i = mode_i;
3615 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3616 failed. */
3617 cached_vf_per_mode[last_mode_i] = -1;
3618 opt_loop_vec_info loop_vinfo
3619 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3620 NULL, vector_modes, mode_i,
3621 autodetected_vector_mode, fatal);
3622 if (fatal)
3623 break;
3625 if (loop_vinfo)
3627 /* Analysis has been successful, so update the VF value. The
3628 VF should always be a multiple of unroll_factor and we want to
3629 capture the original VF here. */
3630 cached_vf_per_mode[last_mode_i]
3631 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3632 loop_vinfo->suggested_unroll_factor);
3633 /* Once we hit the desired simdlen for the first time,
3634 discard any previous attempts. */
3635 if (simdlen
3636 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3638 delete first_loop_vinfo;
3639 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3640 simdlen = 0;
3642 else if (pick_lowest_cost_p
3643 && first_loop_vinfo
3644 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3646 /* Pick loop_vinfo over first_loop_vinfo. */
3647 delete first_loop_vinfo;
3648 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3650 if (first_loop_vinfo == NULL)
3651 first_loop_vinfo = loop_vinfo;
3652 else
3654 delete loop_vinfo;
3655 loop_vinfo = opt_loop_vec_info::success (NULL);
3658 /* Commit to first_loop_vinfo if we have no reason to try
3659 alternatives. */
3660 if (!simdlen && !pick_lowest_cost_p)
3661 break;
3663 if (mode_i == vector_modes.length ()
3664 || autodetected_vector_mode == VOIDmode)
3665 break;
3667 /* Try the next biggest vector size. */
3668 if (dump_enabled_p ())
3669 dump_printf_loc (MSG_NOTE, vect_location,
3670 "***** Re-trying analysis with vector mode %s\n",
3671 GET_MODE_NAME (vector_modes[mode_i]));
3673 if (!first_loop_vinfo)
3674 return opt_loop_vec_info::propagate_failure (res);
3676 if (dump_enabled_p ())
3677 dump_printf_loc (MSG_NOTE, vect_location,
3678 "***** Choosing vector mode %s\n",
3679 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3681 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3682 enabled, SIMDUID is not set, it is the innermost loop and we have
3683 either already found the loop's SIMDLEN or there was no SIMDLEN to
3684 begin with.
3685 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3686 bool vect_epilogues = (!simdlen
3687 && loop->inner == NULL
3688 && param_vect_epilogues_nomask
3689 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3690 /* No code motion support for multiple epilogues so for now
3691 this is not supported when there are multiple exits. */
3692 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3693 && !loop->simduid);
3694 if (!vect_epilogues)
3695 return first_loop_vinfo;
3697 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3698 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3700 /* For epilogues start the analysis from the first mode. The motivation
3701 behind starting from the beginning comes from cases where the VECTOR_MODES
3702 array may contain length-agnostic and length-specific modes. Their
3703 ordering is not guaranteed, so we could end up picking a mode for the main
3704 loop that is after the epilogue's optimal mode. */
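/* Purely as a hypothetical illustration: if the target's mode array were
   { VNx4SI, V16QI, V8QI } and the length-agnostic VNx4SI mode had been
   chosen for the main loop, the fixed-length V16QI entry earlier in the
   array could still be the best epilogue choice, hence the rescan from
   index 0 below.  */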
3705 vector_modes[0] = autodetected_vector_mode;
3706 mode_i = 0;
3708 bool supports_partial_vectors =
3709 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3710 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3712 while (1)
3714 /* If the target does not support partial vectors we can shorten the
3715 number of modes to analyze for the epilogue as we know we can't pick a
3716 mode that would lead to a VF at least as big as the
3717 FIRST_VINFO_VF. */
3718 if (!supports_partial_vectors
3719 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3721 mode_i++;
3722 if (mode_i == vector_modes.length ())
3723 break;
3724 continue;
3727 if (dump_enabled_p ())
3728 dump_printf_loc (MSG_NOTE, vect_location,
3729 "***** Re-trying epilogue analysis with vector "
3730 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3732 bool fatal;
3733 opt_loop_vec_info loop_vinfo
3734 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3735 first_loop_vinfo,
3736 vector_modes, mode_i,
3737 autodetected_vector_mode, fatal);
3738 if (fatal)
3739 break;
3741 if (loop_vinfo)
3743 if (pick_lowest_cost_p)
3745 /* Keep trying to roll back vectorization attempts while the
3746 loop_vec_infos they produced were worse than this one. */
3747 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3748 while (!vinfos.is_empty ()
3749 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3751 gcc_assert (vect_epilogues);
3752 delete vinfos.pop ();
3755 /* For now only allow one epilogue loop. */
3756 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3758 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3759 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3760 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3761 || maybe_ne (lowest_th, 0U));
3762 /* Keep track of the known smallest versioning
3763 threshold. */
3764 if (ordered_p (lowest_th, th))
3765 lowest_th = ordered_min (lowest_th, th);
3767 else
3769 delete loop_vinfo;
3770 loop_vinfo = opt_loop_vec_info::success (NULL);
3773 /* For now only allow one epilogue loop, but allow
3774 pick_lowest_cost_p to replace it, so commit to the
3775 first epilogue if we have no reason to try alternatives. */
3776 if (!pick_lowest_cost_p)
3777 break;
3780 if (mode_i == vector_modes.length ())
3781 break;
3785 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3787 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3788 if (dump_enabled_p ())
3789 dump_printf_loc (MSG_NOTE, vect_location,
3790 "***** Choosing epilogue vector mode %s\n",
3791 GET_MODE_NAME
3792 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3795 return first_loop_vinfo;
3798 /* Return true if there is an in-order reduction function for CODE, storing
3799 it in *REDUC_FN if so. */
3801 static bool
3802 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3804 /* We support MINUS_EXPR by negating the operand. This also preserves an
3805 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3806 (-0.0) = -0.0. */
3807 if (code == PLUS_EXPR || code == MINUS_EXPR)
3809 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3810 return true;
3812 return false;
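/* For illustration: with the mapping above, a scalar loop such as
   "s -= a[i]" is handled as the in-order sum "s += -a[i]", so both
   PLUS_EXPR and MINUS_EXPR reductions end up using IFN_FOLD_LEFT_PLUS.  */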
3815 /* Function reduction_fn_for_scalar_code
3817 Input:
3818 CODE - tree_code of a reduction operation.
3820 Output:
3821 REDUC_FN - the corresponding internal function to be used to reduce the
3822 vector of partial results into a single scalar result, or IFN_LAST
3823 if the operation is a supported reduction operation, but does not have
3824 such an internal function.
3826 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3828 bool
3829 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3831 if (code.is_tree_code ())
3832 switch (tree_code (code))
3834 case MAX_EXPR:
3835 *reduc_fn = IFN_REDUC_MAX;
3836 return true;
3838 case MIN_EXPR:
3839 *reduc_fn = IFN_REDUC_MIN;
3840 return true;
3842 case PLUS_EXPR:
3843 *reduc_fn = IFN_REDUC_PLUS;
3844 return true;
3846 case BIT_AND_EXPR:
3847 *reduc_fn = IFN_REDUC_AND;
3848 return true;
3850 case BIT_IOR_EXPR:
3851 *reduc_fn = IFN_REDUC_IOR;
3852 return true;
3854 case BIT_XOR_EXPR:
3855 *reduc_fn = IFN_REDUC_XOR;
3856 return true;
3858 case MULT_EXPR:
3859 case MINUS_EXPR:
3860 *reduc_fn = IFN_LAST;
3861 return true;
3863 default:
3864 return false;
3866 else
3867 switch (combined_fn (code))
3869 CASE_CFN_FMAX:
3870 *reduc_fn = IFN_REDUC_FMAX;
3871 return true;
3873 CASE_CFN_FMIN:
3874 *reduc_fn = IFN_REDUC_FMIN;
3875 return true;
3877 default:
3878 return false;
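/* Illustration only: a scalar reduction such as "m = MAX (m, a[i])"
   reduces its vector of partial results with IFN_REDUC_MAX, while
   MULT_EXPR is accepted above but reports IFN_LAST, so its epilogue
   must be generated without a single reduction internal function.  */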
3882 /* If there is a neutral value X such that a reduction would not be affected
3883 by the introduction of additional X elements, return that X, otherwise
3884 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3885 of the scalar elements. If the reduction has just a single initial value
3886 then INITIAL_VALUE is that value, otherwise it is null.
3887 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3888 In that case no signed zero is returned. */
3890 tree
3891 neutral_op_for_reduction (tree scalar_type, code_helper code,
3892 tree initial_value, bool as_initial)
3894 if (code.is_tree_code ())
3895 switch (tree_code (code))
3897 case DOT_PROD_EXPR:
3898 case SAD_EXPR:
3899 case MINUS_EXPR:
3900 case BIT_IOR_EXPR:
3901 case BIT_XOR_EXPR:
3902 return build_zero_cst (scalar_type);
3903 case WIDEN_SUM_EXPR:
3904 case PLUS_EXPR:
3905 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3906 return build_real (scalar_type, dconstm0);
3907 else
3908 return build_zero_cst (scalar_type);
3910 case MULT_EXPR:
3911 return build_one_cst (scalar_type);
3913 case BIT_AND_EXPR:
3914 return build_all_ones_cst (scalar_type);
3916 case MAX_EXPR:
3917 case MIN_EXPR:
3918 return initial_value;
3920 default:
3921 return NULL_TREE;
3923 else
3924 switch (combined_fn (code))
3926 CASE_CFN_FMIN:
3927 CASE_CFN_FMAX:
3928 return initial_value;
3930 default:
3931 return NULL_TREE;
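/* Illustration only: padding a MULT_EXPR reduction with 1, or a
   BIT_AND_EXPR reduction with all-ones, leaves the result unchanged;
   MIN_EXPR/MAX_EXPR and FMIN/FMAX have no constant neutral element,
   which is why the initial value itself is returned for them.  */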
3935 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3936 STMT is printed with a message MSG. */
3938 static void
3939 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3941 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3944 /* Return true if we need an in-order reduction for operation CODE
3945 on type TYPE. */
3948 bool
3949 needs_fold_left_reduction_p (tree type, code_helper code)
3951 /* CHECKME: check for !flag_finite_math_only too? */
3952 if (SCALAR_FLOAT_TYPE_P (type))
3954 if (code.is_tree_code ())
3955 switch (tree_code (code))
3957 case MIN_EXPR:
3958 case MAX_EXPR:
3959 return false;
3961 default:
3962 return !flag_associative_math;
3964 else
3965 switch (combined_fn (code))
3967 CASE_CFN_FMIN:
3968 CASE_CFN_FMAX:
3969 return false;
3971 default:
3972 return !flag_associative_math;
3976 if (INTEGRAL_TYPE_P (type))
3977 return (!code.is_tree_code ()
3978 || !operation_no_trapping_overflow (type, tree_code (code)));
3980 if (SAT_FIXED_POINT_TYPE_P (type))
3981 return true;
3983 return false;
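/* Illustration only: a float accumulation "s += a[i]" compiled without
   -fassociative-math must be reduced in order because FP addition is not
   associative, whereas FMIN/FMAX and MIN_EXPR/MAX_EXPR reductions may be
   reordered freely, matching the switches above.  */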
3986 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3987 has a handled computation expression. Store the main reduction
3988 operation in *CODE. */
3990 static bool
3991 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3992 tree loop_arg, code_helper *code,
3993 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3995 auto_bitmap visited;
3996 tree lookfor = PHI_RESULT (phi);
3997 ssa_op_iter curri;
3998 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3999 while (USE_FROM_PTR (curr) != loop_arg)
4000 curr = op_iter_next_use (&curri);
4001 curri.i = curri.numops;
4004 path.safe_push (std::make_pair (curri, curr));
4005 tree use = USE_FROM_PTR (curr);
4006 if (use == lookfor)
4007 break;
4008 gimple *def = SSA_NAME_DEF_STMT (use);
4009 if (gimple_nop_p (def)
4010 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4012 pop:
4015 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4016 curri = x.first;
4017 curr = x.second;
4019 curr = op_iter_next_use (&curri);
4020 /* Skip already visited or non-SSA operands (from iterating
4021 over PHI args). */
4022 while (curr != NULL_USE_OPERAND_P
4023 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4024 || ! bitmap_set_bit (visited,
4025 SSA_NAME_VERSION
4026 (USE_FROM_PTR (curr)))));
4028 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4029 if (curr == NULL_USE_OPERAND_P)
4030 break;
4032 else
4034 if (gimple_code (def) == GIMPLE_PHI)
4035 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4036 else
4037 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4038 while (curr != NULL_USE_OPERAND_P
4039 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4040 || ! bitmap_set_bit (visited,
4041 SSA_NAME_VERSION
4042 (USE_FROM_PTR (curr)))))
4043 curr = op_iter_next_use (&curri);
4044 if (curr == NULL_USE_OPERAND_P)
4045 goto pop;
4048 while (1);
4049 if (dump_file && (dump_flags & TDF_DETAILS))
4051 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4052 unsigned i;
4053 std::pair<ssa_op_iter, use_operand_p> *x;
4054 FOR_EACH_VEC_ELT (path, i, x)
4055 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4056 dump_printf (MSG_NOTE, "\n");
4059 /* Check whether the reduction path detected is valid. */
4060 bool fail = path.length () == 0;
4061 bool neg = false;
4062 int sign = -1;
4063 *code = ERROR_MARK;
4064 for (unsigned i = 1; i < path.length (); ++i)
4066 gimple *use_stmt = USE_STMT (path[i].second);
4067 gimple_match_op op;
4068 if (!gimple_extract_op (use_stmt, &op))
4070 fail = true;
4071 break;
4073 unsigned int opi = op.num_ops;
4074 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4076 /* The following makes sure we can compute the operand index
4077 easily, plus it mostly disallows chaining via COND_EXPR condition
4078 operands. */
4079 for (opi = 0; opi < op.num_ops; ++opi)
4080 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4081 break;
4083 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4085 for (opi = 0; opi < op.num_ops; ++opi)
4086 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4087 break;
4089 if (opi == op.num_ops)
4091 fail = true;
4092 break;
4094 op.code = canonicalize_code (op.code, op.type);
4095 if (op.code == MINUS_EXPR)
4097 op.code = PLUS_EXPR;
4098 /* Track whether we negate the reduction value each iteration. */
4099 if (op.ops[1] == op.ops[opi])
4100 neg = ! neg;
4102 if (CONVERT_EXPR_CODE_P (op.code)
4103 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4105 else if (*code == ERROR_MARK)
4107 *code = op.code;
4108 sign = TYPE_SIGN (op.type);
4110 else if (op.code != *code)
4112 fail = true;
4113 break;
4115 else if ((op.code == MIN_EXPR
4116 || op.code == MAX_EXPR)
4117 && sign != TYPE_SIGN (op.type))
4119 fail = true;
4120 break;
4122 /* Check that the op is used on only a single stmt. For the
4123 non-value-changing tail and the last stmt allow out-of-loop uses.
4124 ??? We could relax this and handle arbitrary live stmts by
4125 forcing a scalar epilogue for example. */
4126 imm_use_iterator imm_iter;
4127 use_operand_p use_p;
4128 gimple *op_use_stmt;
4129 unsigned cnt = 0;
4130 bool cond_fn_p = op.code.is_internal_fn ()
4131 && (conditional_internal_fn_code (internal_fn (op.code))
4132 != ERROR_MARK);
4134 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4136 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4137 op1 twice (once as definition, once as else) in the same operation.
4138 Allow this. */
4139 if (cond_fn_p && op_use_stmt == use_stmt)
4141 gcall *call = as_a<gcall *> (use_stmt);
4142 unsigned else_pos
4143 = internal_fn_else_index (internal_fn (op.code));
4145 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4147 if (j == else_pos)
4148 continue;
4149 if (gimple_call_arg (call, j) == op.ops[opi])
4150 cnt++;
4153 else if (!is_gimple_debug (op_use_stmt)
4154 && (*code != ERROR_MARK
4155 || flow_bb_inside_loop_p (loop,
4156 gimple_bb (op_use_stmt))))
4157 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4158 cnt++;
4161 if (cnt != 1)
4163 fail = true;
4164 break;
4167 return ! fail && ! neg && *code != ERROR_MARK;
4170 bool
4171 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4172 tree loop_arg, enum tree_code code)
4174 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4175 code_helper code_;
4176 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4177 && code_ == code);
4182 /* Function vect_is_simple_reduction
4184 (1) Detect a cross-iteration def-use cycle that represents a simple
4185 reduction computation. We look for the following pattern:
4187 loop_header:
4188 a1 = phi < a0, a2 >
4189 a3 = ...
4190 a2 = operation (a3, a1)
4194 a3 = ...
4195 loop_header:
4196 a1 = phi < a0, a2 >
4197 a2 = operation (a3, a1)
4199 such that:
4200 1. operation is commutative and associative and it is safe to
4201 change the order of the computation
4202 2. no uses for a2 in the loop (a2 is used out of the loop)
4203 3. no uses of a1 in the loop besides the reduction operation
4204 4. no uses of a1 outside the loop.
4206 Conditions 1,4 are tested here.
4207 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4209 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4210 nested cycles.
4212 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4213 reductions:
4215 a1 = phi < a0, a2 >
4216 inner loop (def of a3)
4217 a2 = phi < a3 >
4219 (4) Detect condition expressions, i.e.:
4220 for (int i = 0; i < N; i++)
4221 if (a[i] < val)
4222 ret_val = a[i];
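     For illustration (hypothetical source, matching pattern (1) above):
       for (i = 0; i < N; i++)
         sum += a[i];
     gives a1 = phi <a0, a2> and a2 = a1 + a[i] in the loop header,
     which the code below classifies as a simple reduction.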
4226 static stmt_vec_info
4227 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4228 bool *double_reduc, bool *reduc_chain_p, bool slp)
4230 gphi *phi = as_a <gphi *> (phi_info->stmt);
4231 gimple *phi_use_stmt = NULL;
4232 imm_use_iterator imm_iter;
4233 use_operand_p use_p;
4235 *double_reduc = false;
4236 *reduc_chain_p = false;
4237 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4239 tree phi_name = PHI_RESULT (phi);
4240 /* ??? If there are no uses of the PHI result the inner loop reduction
4241 won't be detected as possibly double-reduction by vectorizable_reduction
4242 because that tries to walk the PHI arg from the preheader edge which
4243 can be constant. See PR60382. */
4244 if (has_zero_uses (phi_name))
4245 return NULL;
4246 class loop *loop = (gimple_bb (phi))->loop_father;
4247 unsigned nphi_def_loop_uses = 0;
4248 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4250 gimple *use_stmt = USE_STMT (use_p);
4251 if (is_gimple_debug (use_stmt))
4252 continue;
4254 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4256 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4258 "intermediate value used outside loop.\n");
4260 return NULL;
4263 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4264 op1 twice (once as definition, once as else) in the same operation.
4265 Only count it as one. */
4266 if (use_stmt != phi_use_stmt)
4268 nphi_def_loop_uses++;
4269 phi_use_stmt = use_stmt;
4273 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4274 if (TREE_CODE (latch_def) != SSA_NAME)
4276 if (dump_enabled_p ())
4277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4278 "reduction: not ssa_name: %T\n", latch_def);
4279 return NULL;
4282 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4283 if (!def_stmt_info
4284 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4285 return NULL;
4287 bool nested_in_vect_loop
4288 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4289 unsigned nlatch_def_loop_uses = 0;
4290 auto_vec<gphi *, 3> lcphis;
4291 bool inner_loop_of_double_reduc = false;
4292 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4294 gimple *use_stmt = USE_STMT (use_p);
4295 if (is_gimple_debug (use_stmt))
4296 continue;
4297 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4298 nlatch_def_loop_uses++;
4299 else
4301 /* We can have more than one loop-closed PHI. */
4302 lcphis.safe_push (as_a <gphi *> (use_stmt));
4303 if (nested_in_vect_loop
4304 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4305 == vect_double_reduction_def))
4306 inner_loop_of_double_reduc = true;
4310 /* If we are vectorizing an inner reduction, we execute it in the
4311 original order only when we are not dealing with a double
4312 reduction. */
4313 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4315 if (dump_enabled_p ())
4316 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4317 "detected nested cycle: ");
4318 return def_stmt_info;
4321 /* When the inner loop of a double reduction ends up with more than
4322 one loop-closed PHI we have failed to classify alternate such
4323 PHIs as double reduction, leading to wrong code. See PR103237. */
4324 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4326 if (dump_enabled_p ())
4327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4328 "unhandle double reduction\n");
4329 return NULL;
4332 /* If this isn't a nested cycle or if the nested cycle reduction value
4333 is used outside of the inner loop we cannot handle uses of the reduction
4334 value. */
4335 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4337 if (dump_enabled_p ())
4338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4339 "reduction used in loop.\n");
4340 return NULL;
4343 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4344 defined in the inner loop. */
4345 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4347 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4348 if (gimple_phi_num_args (def_stmt) != 1
4349 || TREE_CODE (op1) != SSA_NAME)
4351 if (dump_enabled_p ())
4352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4353 "unsupported phi node definition.\n");
4355 return NULL;
4358 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4359 and the latch definition op1. */
4360 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4361 if (gimple_bb (def1)
4362 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4363 && loop->inner
4364 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4365 && (is_gimple_assign (def1) || is_gimple_call (def1))
4366 && is_a <gphi *> (phi_use_stmt)
4367 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4368 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4369 loop_latch_edge (loop->inner))))
4371 if (dump_enabled_p ())
4372 report_vect_op (MSG_NOTE, def_stmt,
4373 "detected double reduction: ");
4375 *double_reduc = true;
4376 return def_stmt_info;
4379 return NULL;
4382 /* Look for the expression computing latch_def from the loop PHI result. */
4383 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4384 code_helper code;
4385 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4386 path))
4388 STMT_VINFO_REDUC_CODE (phi_info) = code;
4389 if (code == COND_EXPR && !nested_in_vect_loop)
4390 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4392 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4393 reduction chain for which the additional restriction is that
4394 all operations in the chain are the same. */
4395 auto_vec<stmt_vec_info, 8> reduc_chain;
4396 unsigned i;
4397 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4398 for (i = path.length () - 1; i >= 1; --i)
4400 gimple *stmt = USE_STMT (path[i].second);
4401 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4402 gimple_match_op op;
4403 if (!gimple_extract_op (stmt, &op))
4404 gcc_unreachable ();
4405 if (gassign *assign = dyn_cast<gassign *> (stmt))
4406 STMT_VINFO_REDUC_IDX (stmt_info)
4407 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4408 else
4410 gcall *call = as_a<gcall *> (stmt);
4411 STMT_VINFO_REDUC_IDX (stmt_info)
4412 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4414 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4415 && (i == 1 || i == path.length () - 1));
4416 if ((op.code != code && !leading_conversion)
4417 /* We can only handle the final value in epilogue
4418 generation for reduction chains. */
4419 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4420 is_slp_reduc = false;
4421 /* For reduction chains we support trailing/leading
4422 conversions. We do not store those in the actual chain. */
4423 if (leading_conversion)
4424 continue;
4425 reduc_chain.safe_push (stmt_info);
4427 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4429 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4431 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4432 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4434 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4435 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4437 /* Save the chain for further analysis in SLP detection. */
4438 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4439 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4441 *reduc_chain_p = true;
4442 if (dump_enabled_p ())
4443 dump_printf_loc (MSG_NOTE, vect_location,
4444 "reduction: detected reduction chain\n");
4446 else if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE, vect_location,
4448 "reduction: detected reduction\n");
4450 return def_stmt_info;
4453 if (dump_enabled_p ())
4454 dump_printf_loc (MSG_NOTE, vect_location,
4455 "reduction: unknown pattern\n");
4457 return NULL;
4460 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4461 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4462 or -1 if not known. */
4464 static int
4465 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4467 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4468 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location,
4472 "cost model: epilogue peel iters set to vf/2 "
4473 "because loop iterations are unknown .\n");
4474 return assumed_vf / 2;
4476 else
4478 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4479 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4480 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4481 /* If we need to peel for gaps but the epilogue otherwise requires no
4482 peeling, we have to peel VF iterations. */
4483 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4484 peel_iters_epilogue = assumed_vf;
4485 return peel_iters_epilogue;
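/* Worked example with made-up numbers: niters = 23, assumed_vf = 8 and
   peel_iters_prologue = 3 give (23 - 3) % 8 = 4 epilogue iterations; if
   the loop also peels for gaps and that remainder had been 0, a full 8
   iterations would be peeled instead.  */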
4489 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4491 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4492 int *peel_iters_epilogue,
4493 stmt_vector_for_cost *scalar_cost_vec,
4494 stmt_vector_for_cost *prologue_cost_vec,
4495 stmt_vector_for_cost *epilogue_cost_vec)
4497 int retval = 0;
4499 *peel_iters_epilogue
4500 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4502 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4504 /* If peeled iterations are known but number of scalar loop
4505 iterations are unknown, count a taken branch per peeled loop. */
4506 if (peel_iters_prologue > 0)
4507 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4508 vect_prologue);
4509 if (*peel_iters_epilogue > 0)
4510 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4511 vect_epilogue);
4514 stmt_info_for_cost *si;
4515 int j;
4516 if (peel_iters_prologue)
4517 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4518 retval += record_stmt_cost (prologue_cost_vec,
4519 si->count * peel_iters_prologue,
4520 si->kind, si->stmt_info, si->misalign,
4521 vect_prologue);
4522 if (*peel_iters_epilogue)
4523 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4524 retval += record_stmt_cost (epilogue_cost_vec,
4525 si->count * *peel_iters_epilogue,
4526 si->kind, si->stmt_info, si->misalign,
4527 vect_epilogue);
4529 return retval;
4532 /* Function vect_estimate_min_profitable_iters
4534 Return the number of iterations required for the vector version of the
4535 loop to be profitable relative to the cost of the scalar version of the
4536 loop.
4538 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4539 of iterations for vectorization. -1 value means loop vectorization
4540 is not profitable. This returned value may be used for dynamic
4541 profitability check.
4543 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4544 for static check against estimated number of iterations. */
4546 static void
4547 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4548 int *ret_min_profitable_niters,
4549 int *ret_min_profitable_estimate,
4550 unsigned *suggested_unroll_factor)
4552 int min_profitable_iters;
4553 int min_profitable_estimate;
4554 int peel_iters_prologue;
4555 int peel_iters_epilogue;
4556 unsigned vec_inside_cost = 0;
4557 int vec_outside_cost = 0;
4558 unsigned vec_prologue_cost = 0;
4559 unsigned vec_epilogue_cost = 0;
4560 int scalar_single_iter_cost = 0;
4561 int scalar_outside_cost = 0;
4562 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4563 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4564 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4566 /* Cost model disabled. */
4567 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4569 if (dump_enabled_p ())
4570 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4571 *ret_min_profitable_niters = 0;
4572 *ret_min_profitable_estimate = 0;
4573 return;
4576 /* Requires loop versioning tests to handle misalignment. */
4577 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4579 /* FIXME: Make cost depend on complexity of individual check. */
4580 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4581 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4582 if (dump_enabled_p ())
4583 dump_printf (MSG_NOTE,
4584 "cost model: Adding cost of checks for loop "
4585 "versioning to treat misalignment.\n");
4588 /* Requires loop versioning with alias checks. */
4589 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4591 /* FIXME: Make cost depend on complexity of individual check. */
4592 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4593 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4594 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4595 if (len)
4596 /* Count LEN - 1 ANDs and LEN comparisons. */
4597 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4598 scalar_stmt, vect_prologue);
4599 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4600 if (len)
4602 /* Count LEN - 1 ANDs and LEN comparisons. */
4603 unsigned int nstmts = len * 2 - 1;
4604 /* +1 for each bias that needs adding. */
4605 for (unsigned int i = 0; i < len; ++i)
4606 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4607 nstmts += 1;
4608 (void) add_stmt_cost (target_cost_data, nstmts,
4609 scalar_stmt, vect_prologue);
4611 if (dump_enabled_p ())
4612 dump_printf (MSG_NOTE,
4613 "cost model: Adding cost of checks for loop "
4614 "versioning aliasing.\n");
4617 /* Requires loop versioning with niter checks. */
4618 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4620 /* FIXME: Make cost depend on complexity of individual check. */
4621 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4622 NULL, NULL, NULL_TREE, 0, vect_prologue);
4623 if (dump_enabled_p ())
4624 dump_printf (MSG_NOTE,
4625 "cost model: Adding cost of checks for loop "
4626 "versioning niters.\n");
4629 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4630 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4631 vect_prologue);
4633 /* Count statements in scalar loop. Using this as scalar cost for a single
4634 iteration for now.
4636 TODO: Add outer loop support.
4638 TODO: Consider assigning different costs to different scalar
4639 statements. */
4641 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4643 /* Add additional cost for the peeled instructions in prologue and epilogue
4644 loop. (For fully-masked loops there will be no peeling.)
4646 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4647 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4649 TODO: Build an expression that represents peel_iters for prologue and
4650 epilogue to be used in a run-time test. */
4652 bool prologue_need_br_taken_cost = false;
4653 bool prologue_need_br_not_taken_cost = false;
4655 /* Calculate peel_iters_prologue. */
4656 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4657 peel_iters_prologue = 0;
4658 else if (npeel < 0)
4660 peel_iters_prologue = assumed_vf / 2;
4661 if (dump_enabled_p ())
4662 dump_printf (MSG_NOTE, "cost model: "
4663 "prologue peel iters set to vf/2.\n");
4665 /* If peeled iterations are unknown, count a taken branch and a not taken
4666 branch per peeled loop. Even if scalar loop iterations are known,
4667 vector iterations are not known since peeled prologue iterations are
4668 not known. Hence guards remain the same. */
4669 prologue_need_br_taken_cost = true;
4670 prologue_need_br_not_taken_cost = true;
4672 else
4674 peel_iters_prologue = npeel;
4675 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4676 /* If peeled iterations are known but number of scalar loop
4677 iterations are unknown, count a taken branch per peeled loop. */
4678 prologue_need_br_taken_cost = true;
4681 bool epilogue_need_br_taken_cost = false;
4682 bool epilogue_need_br_not_taken_cost = false;
4684 /* Calculate peel_iters_epilogue. */
4685 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4686 /* We need to peel exactly one iteration for gaps. */
4687 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4688 else if (npeel < 0)
4690 /* If peeling for alignment is unknown, the loop bound of the main
4691 loop becomes unknown. */
4692 peel_iters_epilogue = assumed_vf / 2;
4693 if (dump_enabled_p ())
4694 dump_printf (MSG_NOTE, "cost model: "
4695 "epilogue peel iters set to vf/2 because "
4696 "peeling for alignment is unknown.\n");
4698 /* See the same reason above in peel_iters_prologue calculation. */
4699 epilogue_need_br_taken_cost = true;
4700 epilogue_need_br_not_taken_cost = true;
4702 else
4704 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4705 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4706 /* If peeled iterations are known but number of scalar loop
4707 iterations are unknown, count a taken branch per peeled loop. */
4708 epilogue_need_br_taken_cost = true;
4711 stmt_info_for_cost *si;
4712 int j;
4713 /* Add costs associated with peel_iters_prologue. */
4714 if (peel_iters_prologue)
4715 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4717 (void) add_stmt_cost (target_cost_data,
4718 si->count * peel_iters_prologue, si->kind,
4719 si->stmt_info, si->node, si->vectype,
4720 si->misalign, vect_prologue);
4723 /* Add costs associated with peel_iters_epilogue. */
4724 if (peel_iters_epilogue)
4725 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4727 (void) add_stmt_cost (target_cost_data,
4728 si->count * peel_iters_epilogue, si->kind,
4729 si->stmt_info, si->node, si->vectype,
4730 si->misalign, vect_epilogue);
4733 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4735 if (prologue_need_br_taken_cost)
4736 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4737 vect_prologue);
4739 if (prologue_need_br_not_taken_cost)
4740 (void) add_stmt_cost (target_cost_data, 1,
4741 cond_branch_not_taken, vect_prologue);
4743 if (epilogue_need_br_taken_cost)
4744 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4745 vect_epilogue);
4747 if (epilogue_need_br_not_taken_cost)
4748 (void) add_stmt_cost (target_cost_data, 1,
4749 cond_branch_not_taken, vect_epilogue);
4751 /* Take care of special costs for rgroup controls of partial vectors. */
4752 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4753 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4754 == vect_partial_vectors_avx512))
4756 /* Calculate how many masks we need to generate. */
4757 unsigned int num_masks = 0;
4758 bool need_saturation = false;
4759 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4760 if (rgm.type)
4762 unsigned nvectors = rgm.factor;
4763 num_masks += nvectors;
4764 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4765 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4766 need_saturation = true;
4769 /* ??? The target isn't able to identify the costs below as
4770 producing masks so it cannot penalize cases where we'd run
4771 out of mask registers for example. */
4773 /* ??? We are also failing to account for smaller vector masks
4774 we generate by splitting larger masks in vect_get_loop_mask. */
4776 /* In the worst case, we need to generate each mask in the prologue
4777 and in the loop body. We need one splat per group and one
4778 compare per mask.
4780 Sometimes the prologue mask will fold to a constant,
4781 so the actual prologue cost might be smaller. However, it's
4782 simpler and safer to use the worst-case cost; if this ends up
4783 being the tie-breaker between vectorizing or not, then it's
4784 probably better not to vectorize. */
4785 (void) add_stmt_cost (target_cost_data,
4786 num_masks
4787 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4788 vector_stmt, NULL, NULL, NULL_TREE, 0,
4789 vect_prologue);
4790 (void) add_stmt_cost (target_cost_data,
4791 num_masks
4792 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4793 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4795 /* When we need saturation we need it both in the prologue and
4796 the epilogue. */
4797 if (need_saturation)
4799 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4800 NULL, NULL, NULL_TREE, 0, vect_prologue);
4801 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4802 NULL, NULL, NULL_TREE, 0, vect_body);
4805 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4806 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4807 == vect_partial_vectors_while_ult))
4809 /* Calculate how many masks we need to generate. */
4810 unsigned int num_masks = 0;
4811 rgroup_controls *rgm;
4812 unsigned int num_vectors_m1;
4813 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4814 num_vectors_m1, rgm)
4815 if (rgm->type)
4816 num_masks += num_vectors_m1 + 1;
4817 gcc_assert (num_masks > 0);
4819 /* In the worst case, we need to generate each mask in the prologue
4820 and in the loop body. One of the loop body mask instructions
4821 replaces the comparison in the scalar loop, and since we don't
4822 count the scalar comparison against the scalar body, we shouldn't
4823 count that vector instruction against the vector body either.
4825 Sometimes we can use unpacks instead of generating prologue
4826 masks and sometimes the prologue mask will fold to a constant,
4827 so the actual prologue cost might be smaller. However, it's
4828 simpler and safer to use the worst-case cost; if this ends up
4829 being the tie-breaker between vectorizing or not, then it's
4830 probably better not to vectorize. */
4831 (void) add_stmt_cost (target_cost_data, num_masks,
4832 vector_stmt, NULL, NULL, NULL_TREE, 0,
4833 vect_prologue);
4834 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4835 vector_stmt, NULL, NULL, NULL_TREE, 0,
4836 vect_body);
4838 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4840 /* Referring to the functions vect_set_loop_condition_partial_vectors
4841 and vect_set_loop_controls_directly, we need to generate each
4842 length in the prologue and in the loop body if required. Although
4843 there are some possible optimizations, we consider the worst case
4844 here. */
4846 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4847 signed char partial_load_store_bias
4848 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4849 bool need_iterate_p
4850 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4851 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4853 /* Calculate how many statements to be added. */
4854 unsigned int prologue_stmts = 0;
4855 unsigned int body_stmts = 0;
4857 rgroup_controls *rgc;
4858 unsigned int num_vectors_m1;
4859 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4860 if (rgc->type)
4862 /* May need one SHIFT for nitems_total computation. */
4863 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4864 if (nitems != 1 && !niters_known_p)
4865 prologue_stmts += 1;
4867 /* May need one MAX and one MINUS for wrap around. */
4868 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4869 prologue_stmts += 2;
4871 /* Need one MAX and one MINUS for each batch limit except for
4872 the first one. */
4873 prologue_stmts += num_vectors_m1 * 2;
4875 unsigned int num_vectors = num_vectors_m1 + 1;
4877 /* Need to set up lengths in prologue, only one MIN required
4878 for each since start index is zero. */
4879 prologue_stmts += num_vectors;
4881 /* If we have a non-zero partial load bias, we need one PLUS
4882 to adjust the load length. */
4883 if (partial_load_store_bias != 0)
4884 body_stmts += 1;
4886 unsigned int length_update_cost = 0;
4887 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4888 /* For the decrement IV style, each length needs only a single
4889 SELECT_VL or MIN to calculate the number of elements to be
4890 processed in the current iteration. */
4891 length_update_cost = 1;
4892 else
4893 /* For the increment IV style, each length may need two MINs and one
4894 MINUS to update the lengths in the body for the next iteration. */
4895 length_update_cost = 3;
4897 if (need_iterate_p)
4898 body_stmts += length_update_cost * num_vectors;
4901 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4902 scalar_stmt, vect_prologue);
4903 (void) add_stmt_cost (target_cost_data, body_stmts,
4904 scalar_stmt, vect_body);
4907 /* FORNOW: The scalar outside cost is incremented in one of the
4908 following ways:
4910 1. The vectorizer checks for alignment and aliasing and generates
4911 a condition that allows dynamic vectorization. A cost model
4912 check is ANDED with the versioning condition. Hence scalar code
4913 path now has the added cost of the versioning check.
4915 if (cost > th & versioning_check)
4916 jmp to vector code
4918 Hence run-time scalar is incremented by not-taken branch cost.
4920 2. The vectorizer then checks if a prologue is required. If the
4921 cost model check was not done before during versioning, it has to
4922 be done before the prologue check.
4924 if (cost <= th)
4925 prologue = scalar_iters
4926 if (prologue == 0)
4927 jmp to vector code
4928 else
4929 execute prologue
4930 if (prologue == num_iters)
4931 go to exit
4933 Hence the run-time scalar cost is incremented by a taken branch,
4934 plus a not-taken branch, plus a taken branch cost.
4936 3. The vectorizer then checks if an epilogue is required. If the
4937 cost model check was not done before during prologue check, it
4938 has to be done with the epilogue check.
4940 if (prologue == 0)
4941 jmp to vector code
4942 else
4943 execute prologue
4944 if (prologue == num_iters)
4945 go to exit
4946 vector code:
4947 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4948 jmp to epilogue
4950 Hence the run-time scalar cost should be incremented by 2 taken
4951 branches.
4953 TODO: The back end may reorder the BBS's differently and reverse
4954 conditions/branch directions. Change the estimates below to
4955 something more reasonable. */
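/* Illustration of the cases above: with versioning the scalar path pays
   one not-taken branch for the combined cost-model check; without
   versioning it pays two taken branches plus, when peeling for alignment
   is unknown, one additional not-taken branch, mirroring the code below.  */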
4957 /* If the number of iterations is known and we do not do versioning, we can
4958 decide whether to vectorize at compile time. Hence the scalar version
4959 does not carry cost model guard costs. */
4960 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4961 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4963 /* Cost model check occurs at versioning. */
4964 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4965 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4966 else
4968 /* Cost model check occurs at prologue generation. */
4969 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4970 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4971 + vect_get_stmt_cost (cond_branch_not_taken);
4972 /* Cost model check occurs at epilogue generation. */
4973 else
4974 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4978 /* Complete the target-specific cost calculations. */
4979 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4980 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4981 suggested_unroll_factor);
4983 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4984 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4985 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4986 *suggested_unroll_factor,
4987 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4989 if (dump_enabled_p ())
4990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4991 "can't unroll as unrolled vectorization factor larger"
4992 " than maximum vectorization factor: "
4993 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4994 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4995 *suggested_unroll_factor = 1;
4998 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5000 if (dump_enabled_p ())
5002 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5003 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5004 vec_inside_cost);
5005 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5006 vec_prologue_cost);
5007 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5008 vec_epilogue_cost);
5009 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5010 scalar_single_iter_cost);
5011 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5012 scalar_outside_cost);
5013 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5014 vec_outside_cost);
5015 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5016 peel_iters_prologue);
5017 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5018 peel_iters_epilogue);
5021 /* Calculate number of iterations required to make the vector version
5022 profitable, relative to the loop bodies only. The following condition
5023 must hold true:
5024 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5025 where
5026 SIC = scalar iteration cost, VIC = vector iteration cost,
5027 VOC = vector outside cost, VF = vectorization factor,
5028 NPEEL = prologue iterations + epilogue iterations,
5029 SOC = scalar outside cost for run time cost model check. */
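/* Illustration with made-up costs: SIC = 4, VIC = 6 and VF = 4 give a
   saving of SIC * VF - VIC = 16 - 6 = 10 units for every vector
   iteration, which is the saving_per_viter computed below.  */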
5031 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5032 - vec_inside_cost);
5033 if (saving_per_viter <= 0)
5035 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5036 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5037 "vectorization did not happen for a simd loop");
5039 if (dump_enabled_p ())
5040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5041 "cost model: the vector iteration cost = %d "
5042 "divided by the scalar iteration cost = %d "
5043 "is greater or equal to the vectorization factor = %d"
5044 ".\n",
5045 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5046 *ret_min_profitable_niters = -1;
5047 *ret_min_profitable_estimate = -1;
5048 return;
5051 /* ??? The "if" arm is written to handle all cases; see below for what
5052 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5053 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5055 /* Rewriting the condition above in terms of the number of
5056 vector iterations (vniters) rather than the number of
5057 scalar iterations (niters) gives:
5059 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5061 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5063 For integer N, X and Y when X > 0:
5065 N * X > Y <==> N >= (Y /[floor] X) + 1. */
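/* Continuing the made-up example: with saving_per_viter = 10 and an
   outside_overhead of 16, at least 16 / 10 + 1 = 2 vector iterations are
   needed before vectorization starts to pay off.  */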
5066 int outside_overhead = (vec_outside_cost
5067 - scalar_single_iter_cost * peel_iters_prologue
5068 - scalar_single_iter_cost * peel_iters_epilogue
5069 - scalar_outside_cost);
5070 /* We're only interested in cases that require at least one
5071 vector iteration. */
5072 int min_vec_niters = 1;
5073 if (outside_overhead > 0)
5074 min_vec_niters = outside_overhead / saving_per_viter + 1;
5076 if (dump_enabled_p ())
5077 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5078 min_vec_niters);
5080 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5082 /* Now that we know the minimum number of vector iterations,
5083 find the minimum niters for which the scalar cost is larger:
5085 SIC * niters > VIC * vniters + VOC - SOC
5087 We know that the minimum niters is no more than
5088 vniters * VF + NPEEL, but it might be (and often is) less
5089 than that if a partial vector iteration is cheaper than the
5090 equivalent scalar code. */
5091 int threshold = (vec_inside_cost * min_vec_niters
5092 + vec_outside_cost
5093 - scalar_outside_cost);
5094 if (threshold <= 0)
5095 min_profitable_iters = 1;
5096 else
5097 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5099 else
5100 /* Convert the number of vector iterations into a number of
5101 scalar iterations. */
5102 min_profitable_iters = (min_vec_niters * assumed_vf
5103 + peel_iters_prologue
5104 + peel_iters_epilogue);
5106 else
5108 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5109 * assumed_vf
5110 - vec_inside_cost * peel_iters_prologue
5111 - vec_inside_cost * peel_iters_epilogue);
5112 if (min_profitable_iters <= 0)
5113 min_profitable_iters = 0;
5114 else
5116 min_profitable_iters /= saving_per_viter;
5118 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5119 <= (((int) vec_inside_cost * min_profitable_iters)
5120 + (((int) vec_outside_cost - scalar_outside_cost)
5121 * assumed_vf)))
5122 min_profitable_iters++;
5126 if (dump_enabled_p ())
5127 dump_printf (MSG_NOTE,
5128 " Calculated minimum iters for profitability: %d\n",
5129 min_profitable_iters);
5131 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5132 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5133 /* We want the vectorized loop to execute at least once. */
5134 min_profitable_iters = assumed_vf + peel_iters_prologue;
5135 else if (min_profitable_iters < peel_iters_prologue)
5136 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5137 vectorized loop executes at least once. */
5138 min_profitable_iters = peel_iters_prologue;
5140 if (dump_enabled_p ())
5141 dump_printf_loc (MSG_NOTE, vect_location,
5142 " Runtime profitability threshold = %d\n",
5143 min_profitable_iters);
5145 *ret_min_profitable_niters = min_profitable_iters;
5147 /* Calculate number of iterations required to make the vector version
5148 profitable, relative to the loop bodies only.
5150 The non-vectorized variant costs SIC * niters and it must win over the vector
5151 variant on the expected loop trip count. The following condition must hold true:
5152 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
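  /* With the same made-up numbers as in the example above (SIC = 4,
     VIC = 6, VF = 4, NPEEL = 0, VOC = 20) and SOC = 8, this condition
     becomes 4 * niters > 6 * (niters / 4) + 20 + 8, i.e. roughly
     2.5 * niters > 28, giving an estimate threshold of about 12
     iterations, versus about 5 for the runtime threshold above where
     SOC sits on the scalar side of the inequality.  */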
5154 if (vec_outside_cost <= 0)
5155 min_profitable_estimate = 0;
5156 /* ??? This "else if" arm is written to handle all cases; see below for
5157 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5158 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5160 /* This is a repeat of the code above, but with + SOC rather
5161 than - SOC. */
5162 int outside_overhead = (vec_outside_cost
5163 - scalar_single_iter_cost * peel_iters_prologue
5164 - scalar_single_iter_cost * peel_iters_epilogue
5165 + scalar_outside_cost);
5166 int min_vec_niters = 1;
5167 if (outside_overhead > 0)
5168 min_vec_niters = outside_overhead / saving_per_viter + 1;
5170 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5172 int threshold = (vec_inside_cost * min_vec_niters
5173 + vec_outside_cost
5174 + scalar_outside_cost);
5175 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5177 else
5178 min_profitable_estimate = (min_vec_niters * assumed_vf
5179 + peel_iters_prologue
5180 + peel_iters_epilogue);
5182 else
5184 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5185 * assumed_vf
5186 - vec_inside_cost * peel_iters_prologue
5187 - vec_inside_cost * peel_iters_epilogue)
5188 / ((scalar_single_iter_cost * assumed_vf)
5189 - vec_inside_cost);
5191 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5192 if (dump_enabled_p ())
5193 dump_printf_loc (MSG_NOTE, vect_location,
5194 " Static estimate profitability threshold = %d\n",
5195 min_profitable_estimate);
5197 *ret_min_profitable_estimate = min_profitable_estimate;
5200 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5201 vector elements (not bits) for a vector with NELT elements. */
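/* For example (illustration only), with OFFSET = 2 and NELT = 8 the
   encoded series expands to the selector {2, 3, 4, 5, 6, 7, 8, 9};
   indices 8 and 9 pick elements from the second vec_perm operand
   (typically a zero vector), supplying the shifted-in elements.  */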
5202 static void
5203 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5204 vec_perm_builder *sel)
5206 /* The encoding is a single stepped pattern. Any wrap-around is handled
5207 by vec_perm_indices. */
5208 sel->new_vector (nelt, 1, 3);
5209 for (unsigned int i = 0; i < 3; i++)
5210 sel->quick_push (i + offset);
5213 /* Checks whether the target supports whole-vector shifts for vectors of mode
5214 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5215 it supports vec_perm_const with masks for all necessary shift amounts. */
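/* For example (illustration only), for an 8-element vector mode without
   vec_shr support this checks vec_perm_const for the shift selectors
   with OFFSET 4, 2 and 1, the same shift amounts that the shift-based
   reduction epilogue further below steps through.  */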
5216 static bool
5217 have_whole_vector_shift (machine_mode mode)
5219 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5220 return true;
5222 /* Variable-length vectors should be handled via the optab. */
5223 unsigned int nelt;
5224 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5225 return false;
5227 vec_perm_builder sel;
5228 vec_perm_indices indices;
5229 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5231 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5232 indices.new_vector (sel, 2, nelt);
5233 if (!can_vec_perm_const_p (mode, mode, indices, false))
5234 return false;
5236 return true;
5239 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5240 multiplication operands have differing signs and (b) we intend
5241 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5242 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5244 static bool
5245 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5246 stmt_vec_info stmt_info)
5248 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5249 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5250 return false;
5252 tree rhs1 = gimple_assign_rhs1 (assign);
5253 tree rhs2 = gimple_assign_rhs2 (assign);
5254 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5255 return false;
5257 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5258 gcc_assert (reduc_info->is_reduc_info);
5259 return !directly_supported_p (DOT_PROD_EXPR,
5260 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5261 optab_vector_mixed_sign);
5264 /* TODO: There is a close dependency between the vect_model_*_cost and
5265 vectorizable_* functions. Design this better to avoid maintenance issues. */
5267 /* Function vect_model_reduction_cost.
5269 Models cost for a reduction operation, including the vector ops
5270 generated within the strip-mine loop in some cases, the initial
5271 definition before the loop, and the epilogue code that must be generated. */
5273 static void
5274 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5275 stmt_vec_info stmt_info, internal_fn reduc_fn,
5276 vect_reduction_type reduction_type,
5277 int ncopies, stmt_vector_for_cost *cost_vec)
5279 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5280 tree vectype;
5281 machine_mode mode;
5282 class loop *loop = NULL;
5284 if (loop_vinfo)
5285 loop = LOOP_VINFO_LOOP (loop_vinfo);
5287 /* Condition reductions generate two reductions in the loop. */
5288 if (reduction_type == COND_REDUCTION)
5289 ncopies *= 2;
5291 vectype = STMT_VINFO_VECTYPE (stmt_info);
5292 mode = TYPE_MODE (vectype);
5293 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5295 gimple_match_op op;
5296 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5297 gcc_unreachable ();
5299 bool emulated_mixed_dot_prod
5300 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5301 if (reduction_type == EXTRACT_LAST_REDUCTION)
5302 /* No extra instructions are needed in the prologue. The loop body
5303 operations are costed in vectorizable_condition. */
5304 inside_cost = 0;
5305 else if (reduction_type == FOLD_LEFT_REDUCTION)
5307 /* No extra instructions needed in the prologue. */
5308 prologue_cost = 0;
5310 if (reduc_fn != IFN_LAST)
5311 /* Count one reduction-like operation per vector. */
5312 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5313 stmt_info, 0, vect_body);
5314 else
5316 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5317 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5318 inside_cost = record_stmt_cost (cost_vec, nelements,
5319 vec_to_scalar, stmt_info, 0,
5320 vect_body);
5321 inside_cost += record_stmt_cost (cost_vec, nelements,
5322 scalar_stmt, stmt_info, 0,
5323 vect_body);
5326 else
5328 /* Add in the cost of the initial definitions. */
5329 int prologue_stmts;
5330 if (reduction_type == COND_REDUCTION)
5331 /* For cond reductions we have four vectors: initial index, step,
5332 initial result of the data reduction, initial value of the index
5333 reduction. */
5334 prologue_stmts = 4;
5335 else if (emulated_mixed_dot_prod)
5336 /* We need the initial reduction value and two invariants:
5337 one that contains the minimum signed value and one that
5338 contains half of its negative. */
5339 prologue_stmts = 3;
5340 else
5341 prologue_stmts = 1;
5342 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5343 scalar_to_vec, stmt_info, 0,
5344 vect_prologue);
5347 /* Determine cost of epilogue code.
5349 We have a reduction operator that will reduce the vector in one statement.
5350 Also requires scalar extract. */
5352 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5354 if (reduc_fn != IFN_LAST)
5356 if (reduction_type == COND_REDUCTION)
5358 /* An EQ stmt and a COND_EXPR stmt. */
5359 epilogue_cost += record_stmt_cost (cost_vec, 2,
5360 vector_stmt, stmt_info, 0,
5361 vect_epilogue);
5362 /* Reduction of the max index and a reduction of the found
5363 values. */
5364 epilogue_cost += record_stmt_cost (cost_vec, 2,
5365 vec_to_scalar, stmt_info, 0,
5366 vect_epilogue);
5367 /* A broadcast of the max value. */
5368 epilogue_cost += record_stmt_cost (cost_vec, 1,
5369 scalar_to_vec, stmt_info, 0,
5370 vect_epilogue);
5372 else
5374 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5375 stmt_info, 0, vect_epilogue);
5376 epilogue_cost += record_stmt_cost (cost_vec, 1,
5377 vec_to_scalar, stmt_info, 0,
5378 vect_epilogue);
5381 else if (reduction_type == COND_REDUCTION)
5383 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5384 /* Extraction of scalar elements. */
5385 epilogue_cost += record_stmt_cost (cost_vec,
5386 2 * estimated_nunits,
5387 vec_to_scalar, stmt_info, 0,
5388 vect_epilogue);
5389 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5390 epilogue_cost += record_stmt_cost (cost_vec,
5391 2 * estimated_nunits - 3,
5392 scalar_stmt, stmt_info, 0,
5393 vect_epilogue);
5395 else if (reduction_type == EXTRACT_LAST_REDUCTION
5396 || reduction_type == FOLD_LEFT_REDUCTION)
5397 /* No extra instructions are needed in the epilogue. */
5399 else
5401 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5402 tree bitsize = TYPE_SIZE (op.type);
5403 int element_bitsize = tree_to_uhwi (bitsize);
5404 int nelements = vec_size_in_bits / element_bitsize;
5406 if (op.code == COND_EXPR)
5407 op.code = MAX_EXPR;
5409 /* We have a whole vector shift available. */
5410 if (VECTOR_MODE_P (mode)
5411 && directly_supported_p (op.code, vectype)
5412 && have_whole_vector_shift (mode))
5414 /* Final reduction via vector shifts and the reduction operator.
5415 Also requires scalar extract. */
5416 epilogue_cost += record_stmt_cost (cost_vec,
5417 exact_log2 (nelements) * 2,
5418 vector_stmt, stmt_info, 0,
5419 vect_epilogue);
5420 epilogue_cost += record_stmt_cost (cost_vec, 1,
5421 vec_to_scalar, stmt_info, 0,
5422 vect_epilogue);
5424 else
5425 /* Use extracts and reduction op for final reduction. For N
5426 elements, we have N extracts and N-1 reduction ops. */
5427 epilogue_cost += record_stmt_cost (cost_vec,
5428 nelements + nelements - 1,
5429 vector_stmt, stmt_info, 0,
5430 vect_epilogue);
5434 if (dump_enabled_p ())
5435 dump_printf (MSG_NOTE,
5436 "vect_model_reduction_cost: inside_cost = %d, "
5437 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5438 prologue_cost, epilogue_cost);
5441 /* SEQ is a sequence of instructions that initialize the reduction
5442 described by REDUC_INFO. Emit them in the appropriate place. */
5444 static void
5445 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5446 stmt_vec_info reduc_info, gimple *seq)
5448 if (reduc_info->reused_accumulator)
5450 /* When reusing an accumulator from the main loop, we only need
5451 initialization instructions if the main loop can be skipped.
5452 In that case, emit the initialization instructions at the end
5453 of the guard block that does the skip. */
5454 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5455 gcc_assert (skip_edge);
5456 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5457 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5459 else
5461 /* The normal case: emit the initialization instructions on the
5462 preheader edge. */
5463 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5464 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5468 /* Function get_initial_def_for_reduction
5470 Input:
5471 REDUC_INFO - the info_for_reduction
5472 INIT_VAL - the initial value of the reduction variable
5473 NEUTRAL_OP - a value that has no effect on the reduction, as per
5474 neutral_op_for_reduction
5476 Output:
5477 Return a vector variable, initialized according to the operation that
5478 STMT_VINFO performs. This vector will be used as the initial value
5479 of the vector of partial results.
5481 The value we need is a vector in which element 0 has value INIT_VAL
5482 and every other element has value NEUTRAL_OP. */
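/* For example (illustration only), for a signed integer add reduction
   with INIT_VAL 5 and NEUTRAL_OP 0 on a 4-element vector this yields
   {5, 0, 0, 0}; for MIN/MAX reductions the neutral value is INIT_VAL
   itself, so the result degenerates to the splat case handled first
   in the function body.  */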
5484 static tree
5485 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5486 stmt_vec_info reduc_info,
5487 tree init_val, tree neutral_op)
5489 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5490 tree scalar_type = TREE_TYPE (init_val);
5491 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5492 tree init_def;
5493 gimple_seq stmts = NULL;
5495 gcc_assert (vectype);
5497 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5498 || SCALAR_FLOAT_TYPE_P (scalar_type));
5500 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5501 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5503 if (operand_equal_p (init_val, neutral_op))
5505 /* If both elements are equal then the vector described above is
5506 just a splat. */
5507 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5508 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5510 else
5512 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5513 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5514 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5516 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5517 element 0. */
5518 init_def = gimple_build_vector_from_val (&stmts, vectype,
5519 neutral_op);
5520 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5521 vectype, init_def, init_val);
5523 else
5525 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5526 tree_vector_builder elts (vectype, 1, 2);
5527 elts.quick_push (init_val);
5528 elts.quick_push (neutral_op);
5529 init_def = gimple_build_vector (&stmts, &elts);
5533 if (stmts)
5534 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5535 return init_def;
5538 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5539 which performs a reduction involving GROUP_SIZE scalar statements.
5540 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5541 is nonnull, introducing extra elements of that value will not change the
5542 result. */
5544 static void
5545 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5546 stmt_vec_info reduc_info,
5547 vec<tree> *vec_oprnds,
5548 unsigned int number_of_vectors,
5549 unsigned int group_size, tree neutral_op)
5551 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5552 unsigned HOST_WIDE_INT nunits;
5553 unsigned j, number_of_places_left_in_vector;
5554 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5555 unsigned int i;
5557 gcc_assert (group_size == initial_values.length () || neutral_op);
5559 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5560 created vectors. It is greater than 1 if unrolling is performed.
5562 For example, we have two scalar operands, s1 and s2 (e.g., group of
5563 strided accesses of size two), while NUNITS is four (i.e., four scalars
5564 of this type can be packed in a vector). The output vector will contain
5565 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5566 will be 2).
5568 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5569 vectors containing the operands.
5571 For example, NUNITS is four as before, and the group size is 8
5572 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5573 {s5, s6, s7, s8}. */
5575 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5576 nunits = group_size;
5578 number_of_places_left_in_vector = nunits;
5579 bool constant_p = true;
5580 tree_vector_builder elts (vector_type, nunits, 1);
5581 elts.quick_grow (nunits);
5582 gimple_seq ctor_seq = NULL;
5583 for (j = 0; j < nunits * number_of_vectors; ++j)
5585 tree op;
5586 i = j % group_size;
5588 /* Get the def before the loop. In a reduction chain we have only
5589 one initial value; otherwise we have as many as there are PHIs in the group. */
5590 if (i >= initial_values.length () || (j > i && neutral_op))
5591 op = neutral_op;
5592 else
5593 op = initial_values[i];
5595 /* Create 'vect_ = {op0,op1,...,opn}'. */
5596 number_of_places_left_in_vector--;
5597 elts[nunits - number_of_places_left_in_vector - 1] = op;
5598 if (!CONSTANT_CLASS_P (op))
5599 constant_p = false;
5601 if (number_of_places_left_in_vector == 0)
5603 tree init;
5604 if (constant_p && !neutral_op
5605 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5606 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5607 /* Build the vector directly from ELTS. */
5608 init = gimple_build_vector (&ctor_seq, &elts);
5609 else if (neutral_op)
5611 /* Build a vector of the neutral value and shift the
5612 other elements into place. */
5613 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5614 neutral_op);
5615 int k = nunits;
5616 while (k > 0 && elts[k - 1] == neutral_op)
5617 k -= 1;
5618 while (k > 0)
5620 k -= 1;
5621 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5622 vector_type, init, elts[k]);
5625 else
5627 /* First time round, duplicate ELTS to fill the
5628 required number of vectors. */
5629 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5630 elts, number_of_vectors, *vec_oprnds);
5631 break;
5633 vec_oprnds->quick_push (init);
5635 number_of_places_left_in_vector = nunits;
5636 elts.new_vector (vector_type, nunits, 1);
5637 elts.quick_grow (nunits);
5638 constant_p = true;
5641 if (ctor_seq != NULL)
5642 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5645 /* For a statement STMT_INFO taking part in a reduction operation return
5646 the stmt_vec_info the meta information is stored on. */
5648 stmt_vec_info
5649 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5651 stmt_info = vect_orig_stmt (stmt_info);
5652 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5653 if (!is_a <gphi *> (stmt_info->stmt)
5654 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5655 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5656 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5657 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5659 if (gimple_phi_num_args (phi) == 1)
5660 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5662 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5664 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5665 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5666 stmt_info = info;
5668 return stmt_info;
5671 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5672 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5673 return false. */
5675 static bool
5676 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5677 stmt_vec_info reduc_info)
5679 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5680 if (!main_loop_vinfo)
5681 return false;
5683 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5684 return false;
5686 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5687 auto_vec<tree, 16> main_loop_results (num_phis);
5688 auto_vec<tree, 16> initial_values (num_phis);
5689 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5691 /* The epilogue loop can be entered either from the main loop or
5692 from an earlier guard block. */
5693 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5694 for (tree incoming_value : reduc_info->reduc_initial_values)
5696 /* Look for:
5698 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5699 INITIAL_VALUE(guard block)>. */
5700 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5702 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5703 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5705 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5706 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5708 main_loop_results.quick_push (from_main_loop);
5709 initial_values.quick_push (from_skip);
5712 else
5713 /* The main loop dominates the epilogue loop. */
5714 main_loop_results.splice (reduc_info->reduc_initial_values);
5716 /* See if the main loop has the kind of accumulator we need. */
5717 vect_reusable_accumulator *accumulator
5718 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5719 if (!accumulator
5720 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5721 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5722 accumulator->reduc_info->reduc_scalar_results.begin ()))
5723 return false;
5725 /* Handle the case where we can reduce wider vectors to narrower ones. */
5726 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5727 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5728 unsigned HOST_WIDE_INT m;
5729 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5730 TYPE_VECTOR_SUBPARTS (vectype), &m))
5731 return false;
5732 /* Check the intermediate vector types and operations are available. */
5733 tree prev_vectype = old_vectype;
5734 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5735 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5737 intermediate_nunits = exact_div (intermediate_nunits, 2);
5738 tree intermediate_vectype = get_related_vectype_for_scalar_type
5739 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5740 if (!intermediate_vectype
5741 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5742 intermediate_vectype)
5743 || !can_vec_extract (TYPE_MODE (prev_vectype),
5744 TYPE_MODE (intermediate_vectype)))
5745 return false;
5746 prev_vectype = intermediate_vectype;
5749 /* Non-SLP reductions might apply an adjustment after the reduction
5750 operation, in order to simplify the initialization of the accumulator.
5751 If the epilogue loop carries on from where the main loop left off,
5752 it should apply the same adjustment to the final reduction result.
5754 If the epilogue loop can also be entered directly (rather than via
5755 the main loop), we need to be able to handle that case in the same way,
5756 with the same adjustment. (In principle we could add a PHI node
5757 to select the correct adjustment, but in practice that shouldn't be
5758 necessary.) */
5759 tree main_adjustment
5760 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5761 if (loop_vinfo->main_loop_edge && main_adjustment)
5763 gcc_assert (num_phis == 1);
5764 tree initial_value = initial_values[0];
5765 /* Check that we can use INITIAL_VALUE as the adjustment and
5766 initialize the accumulator with a neutral value instead. */
5767 if (!operand_equal_p (initial_value, main_adjustment))
5768 return false;
5769 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5770 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5771 code, initial_value);
5773 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5774 reduc_info->reduc_initial_values.truncate (0);
5775 reduc_info->reduc_initial_values.splice (initial_values);
5776 reduc_info->reused_accumulator = accumulator;
5777 return true;
5780 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5781 CODE, emitting any new statements into SEQ. Returns a vector def of VECTYPE. */
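/* For example (illustration only), reducing a 16-element accumulator to
   a 4-element VECTYPE with a PLUS_EXPR CODE takes two halving steps,
   16 -> 8 -> 4; each step extracts the low and high halves (directly,
   or via an integer-mode view-convert when no direct sub-vector extract
   exists) and adds them.  */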
5783 static tree
5784 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5785 gimple_seq *seq)
5787 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5788 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5789 tree stype = TREE_TYPE (vectype);
5790 tree new_temp = vec_def;
5791 while (nunits > nunits1)
5793 nunits /= 2;
5794 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5795 stype, nunits);
5796 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5798 /* The target has to make sure we support lowpart/highpart
5799 extraction, either via direct vector extract or through
5800 integer mode punning. */
5801 tree dst1, dst2;
5802 gimple *epilog_stmt;
5803 if (convert_optab_handler (vec_extract_optab,
5804 TYPE_MODE (TREE_TYPE (new_temp)),
5805 TYPE_MODE (vectype1))
5806 != CODE_FOR_nothing)
5808 /* Extract sub-vectors directly once vec_extract becomes
5809 a conversion optab. */
5810 dst1 = make_ssa_name (vectype1);
5811 epilog_stmt
5812 = gimple_build_assign (dst1, BIT_FIELD_REF,
5813 build3 (BIT_FIELD_REF, vectype1,
5814 new_temp, TYPE_SIZE (vectype1),
5815 bitsize_int (0)));
5816 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5817 dst2 = make_ssa_name (vectype1);
5818 epilog_stmt
5819 = gimple_build_assign (dst2, BIT_FIELD_REF,
5820 build3 (BIT_FIELD_REF, vectype1,
5821 new_temp, TYPE_SIZE (vectype1),
5822 bitsize_int (bitsize)));
5823 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5825 else
5827 /* Extract via punning to appropriately sized integer mode
5828 vector. */
5829 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5830 tree etype = build_vector_type (eltype, 2);
5831 gcc_assert (convert_optab_handler (vec_extract_optab,
5832 TYPE_MODE (etype),
5833 TYPE_MODE (eltype))
5834 != CODE_FOR_nothing);
5835 tree tem = make_ssa_name (etype);
5836 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5837 build1 (VIEW_CONVERT_EXPR,
5838 etype, new_temp));
5839 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5840 new_temp = tem;
5841 tem = make_ssa_name (eltype);
5842 epilog_stmt
5843 = gimple_build_assign (tem, BIT_FIELD_REF,
5844 build3 (BIT_FIELD_REF, eltype,
5845 new_temp, TYPE_SIZE (eltype),
5846 bitsize_int (0)));
5847 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5848 dst1 = make_ssa_name (vectype1);
5849 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5850 build1 (VIEW_CONVERT_EXPR,
5851 vectype1, tem));
5852 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5853 tem = make_ssa_name (eltype);
5854 epilog_stmt
5855 = gimple_build_assign (tem, BIT_FIELD_REF,
5856 build3 (BIT_FIELD_REF, eltype,
5857 new_temp, TYPE_SIZE (eltype),
5858 bitsize_int (bitsize)));
5859 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5860 dst2 = make_ssa_name (vectype1);
5861 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5862 build1 (VIEW_CONVERT_EXPR,
5863 vectype1, tem));
5864 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5867 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5870 return new_temp;
5873 /* Retrieves the defining statement to be used for a reduction.
5874 For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5875 the reduction definitions. */
5877 tree
5878 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5879 slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5880 vec <gimple *> &vec_stmts)
5882 tree def;
5884 if (slp_node)
5886 if (!main_exit_p)
5887 slp_node = slp_node_instance->reduc_phis;
5888 def = vect_get_slp_vect_def (slp_node, i);
5890 else
5892 if (!main_exit_p)
5893 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5894 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5895 def = gimple_get_lhs (vec_stmts[0]);
5898 return def;
5901 /* Function vect_create_epilog_for_reduction
5903 Create code at the loop-epilog to finalize the result of a reduction
5904 computation.
5906 STMT_INFO is the scalar reduction stmt that is being vectorized.
5907 SLP_NODE is an SLP node containing a group of reduction statements. The
5908 first one in this group is STMT_INFO.
5909 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5910 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5911 (counting from 0)
5912 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5913 exit this edge is always the main loop exit.
5915 This function:
5916 1. Completes the reduction def-use cycles.
5917 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5918 by calling the function specified by REDUC_FN if available, or by
5919 other means (whole-vector shifts or a scalar loop).
5920 The function also creates a new phi node at the loop exit to preserve
5921 loop-closed form, as illustrated below.
5923 The flow at the entry to this function:
5925 loop:
5926 vec_def = phi <vec_init, null> # REDUCTION_PHI
5927 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5928 s_loop = scalar_stmt # (scalar) STMT_INFO
5929 loop_exit:
5930 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5931 use <s_out0>
5932 use <s_out0>
5934 The above is transformed by this function into:
5936 loop:
5937 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5938 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5939 s_loop = scalar_stmt # (scalar) STMT_INFO
5940 loop_exit:
5941 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5942 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5943 v_out2 = reduce <v_out1>
5944 s_out3 = extract_field <v_out2, 0>
5945 s_out4 = adjust_result <s_out3>
5946 use <s_out4>
5947 use <s_out4>
5950 static void
5951 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5952 stmt_vec_info stmt_info,
5953 slp_tree slp_node,
5954 slp_instance slp_node_instance,
5955 edge loop_exit)
5957 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5958 gcc_assert (reduc_info->is_reduc_info);
5959 /* For double reductions we need to get at the inner loop reduction
5960 stmt which has the meta info attached. Our stmt_info is that of the
5961 loop-closed PHI of the inner loop which we remember as
5962 def for the reduction PHI generation. */
5963 bool double_reduc = false;
5964 bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5965 stmt_vec_info rdef_info = stmt_info;
5966 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5968 gcc_assert (!slp_node);
5969 double_reduc = true;
5970 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5971 (stmt_info->stmt, 0));
5972 stmt_info = vect_stmt_to_vectorize (stmt_info);
5974 gphi *reduc_def_stmt
5975 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5976 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5977 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5978 tree vectype;
5979 machine_mode mode;
5980 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5981 basic_block exit_bb;
5982 tree scalar_dest;
5983 tree scalar_type;
5984 gimple *new_phi = NULL, *phi = NULL;
5985 gimple_stmt_iterator exit_gsi;
5986 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5987 gimple *epilog_stmt = NULL;
5988 gimple *exit_phi;
5989 tree bitsize;
5990 tree def;
5991 tree orig_name, scalar_result;
5992 imm_use_iterator imm_iter, phi_imm_iter;
5993 use_operand_p use_p, phi_use_p;
5994 gimple *use_stmt;
5995 auto_vec<tree> reduc_inputs;
5996 int j, i;
5997 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5998 unsigned int group_size = 1, k;
5999 auto_vec<gimple *> phis;
6000 /* SLP reduction without reduction chain, e.g.,
6001 # a1 = phi <a2, a0>
6002 # b1 = phi <b2, b0>
6003 a2 = operation (a1)
6004 b2 = operation (b1) */
6005 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6006 bool direct_slp_reduc;
6007 tree induction_index = NULL_TREE;
6009 if (slp_node)
6010 group_size = SLP_TREE_LANES (slp_node);
6012 if (nested_in_vect_loop_p (loop, stmt_info))
6014 outer_loop = loop;
6015 loop = loop->inner;
6016 gcc_assert (!slp_node && double_reduc);
6019 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6020 gcc_assert (vectype);
6021 mode = TYPE_MODE (vectype);
6023 tree induc_val = NULL_TREE;
6024 tree adjustment_def = NULL;
6025 if (slp_node)
6027 else
6029 /* Optimize: for induction condition reduction, if we can't use zero
6030 for induc_val, use initial_def. */
6031 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6032 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6033 else if (double_reduc)
6035 else
6036 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6039 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6040 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6041 if (slp_reduc)
6042 /* All statements produce live-out values. */
6043 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6044 else if (slp_node)
6046 /* The last statement in the reduction chain produces the live-out
6047 value. Note SLP optimization can shuffle scalar stmts to
6048 optimize permutations so we have to search for the last stmt. */
6049 for (k = 0; k < group_size; ++k)
6050 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6052 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6053 break;
6057 unsigned vec_num;
6058 int ncopies;
6059 if (slp_node)
6061 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6062 ncopies = 1;
6064 else
6066 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6067 vec_num = 1;
6068 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6071 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6072 which is updated with the current index of the loop for every match of
6073 the original loop's cond_expr (VEC_STMT). This results in a vector
6074 containing the last time the condition passed for that vector lane.
6075 The first match will be a 1 to allow 0 to be used for non-matching
6076 indexes. If there are no matches at all then the vector will be all
6077 zeroes.
6079 PR92772: This algorithm is broken for architectures that support
6080 masked vectors, but do not provide fold_extract_last. */
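  /* An illustrative example (made-up lanes): with VF = 4 and two vector
     iterations the IV vector is {1, 2, 3, 4} and then {5, 6, 7, 8}.
     If the condition holds for lane 1 in both iterations and for lane 3
     only in the first, the resulting index vector is {0, 6, 0, 4};
     lanes whose condition never matched keep 0.  */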
6081 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6083 auto_vec<std::pair<tree, bool>, 2> ccompares;
6084 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6085 cond_info = vect_stmt_to_vectorize (cond_info);
6086 while (cond_info != reduc_info)
6088 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6090 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6091 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6092 ccompares.safe_push
6093 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6094 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6096 cond_info
6097 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6098 1 + STMT_VINFO_REDUC_IDX
6099 (cond_info)));
6100 cond_info = vect_stmt_to_vectorize (cond_info);
6102 gcc_assert (ccompares.length () != 0);
6104 tree indx_before_incr, indx_after_incr;
6105 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6106 int scalar_precision
6107 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6108 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6109 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6110 (TYPE_MODE (vectype), cr_index_scalar_type,
6111 TYPE_VECTOR_SUBPARTS (vectype));
6113 /* First we create a simple vector induction variable which starts
6114 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6115 vector size (STEP). */
6117 /* Create a {1,2,3,...} vector. */
6118 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6120 /* Create a vector of the step value. */
6121 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6122 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6124 /* Create an induction variable. */
6125 gimple_stmt_iterator incr_gsi;
6126 bool insert_after;
6127 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6128 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6129 insert_after, &indx_before_incr, &indx_after_incr);
6131 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6132 filled with zeros (VEC_ZERO). */
6134 /* Create a vector of 0s. */
6135 tree zero = build_zero_cst (cr_index_scalar_type);
6136 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6138 /* Create a vector phi node. */
6139 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6140 new_phi = create_phi_node (new_phi_tree, loop->header);
6141 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6142 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6144 /* Now take the condition from the loop's original cond_exprs
6145 and produce a new cond_expr (INDEX_COND_EXPR) which for
6146 every match uses values from the induction variable
6147 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6148 (NEW_PHI_TREE).
6149 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6150 the new cond_expr (INDEX_COND_EXPR). */
6151 gimple_seq stmts = NULL;
6152 for (int i = ccompares.length () - 1; i != -1; --i)
6154 tree ccompare = ccompares[i].first;
6155 if (ccompares[i].second)
6156 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6157 cr_index_vector_type,
6158 ccompare,
6159 indx_before_incr, new_phi_tree);
6160 else
6161 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6162 cr_index_vector_type,
6163 ccompare,
6164 new_phi_tree, indx_before_incr);
6166 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6168 /* Update the phi with the vec cond. */
6169 induction_index = new_phi_tree;
6170 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6171 loop_latch_edge (loop), UNKNOWN_LOCATION);
6174 /* 2. Create epilog code.
6175 The reduction epilog code operates across the elements of the vector
6176 of partial results computed by the vectorized loop.
6177 The reduction epilog code consists of:
6179 step 1: compute the scalar result in a vector (v_out2)
6180 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6181 step 3: adjust the scalar result (s_out3) if needed.
6183 Step 1 can be accomplished using one of the following three schemes:
6184 (scheme 1) using reduc_fn, if available.
6185 (scheme 2) using whole-vector shifts, if available.
6186 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6187 combined.
6189 The overall epilog code looks like this:
6191 s_out0 = phi <s_loop> # original EXIT_PHI
6192 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6193 v_out2 = reduce <v_out1> # step 1
6194 s_out3 = extract_field <v_out2, 0> # step 2
6195 s_out4 = adjust_result <s_out3> # step 3
6197 (step 3 is optional, and steps 1 and 2 may be combined).
6198 Lastly, the uses of s_out0 are replaced by s_out4. */
6201 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6202 v_out1 = phi <VECT_DEF>
6203 Store them in NEW_PHIS. */
6204 if (double_reduc)
6205 loop = outer_loop;
6206 /* We need to reduce values in all exits. */
6207 exit_bb = loop_exit->dest;
6208 exit_gsi = gsi_after_labels (exit_bb);
6209 reduc_inputs.create (slp_node ? vec_num : ncopies);
6210 vec <gimple *> vec_stmts = vNULL;
6211 for (unsigned i = 0; i < vec_num; i++)
6213 gimple_seq stmts = NULL;
6214 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6215 main_exit_p, i, vec_stmts);
6216 for (j = 0; j < ncopies; j++)
6218 tree new_def = copy_ssa_name (def);
6219 phi = create_phi_node (new_def, exit_bb);
6220 if (j)
6221 def = gimple_get_lhs (vec_stmts[j]);
6222 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6223 new_def = gimple_convert (&stmts, vectype, new_def);
6224 reduc_inputs.quick_push (new_def);
6226 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6229 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6230 (i.e. when reduc_fn is not available) and in the final adjustment
6231 code (if needed). Also get the original scalar reduction variable as
6232 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6233 represents a reduction pattern), the tree-code and scalar-def are
6234 taken from the original stmt that the pattern-stmt (STMT) replaces.
6235 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6236 are taken from STMT. */
6238 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6239 if (orig_stmt_info != stmt_info)
6241 /* Reduction pattern */
6242 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6243 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6246 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6247 scalar_type = TREE_TYPE (scalar_dest);
6248 scalar_results.truncate (0);
6249 scalar_results.reserve_exact (group_size);
6250 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6251 bitsize = TYPE_SIZE (scalar_type);
6253 /* True if we should implement SLP_REDUC using native reduction operations
6254 instead of scalar operations. */
6255 direct_slp_reduc = (reduc_fn != IFN_LAST
6256 && slp_reduc
6257 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6259 /* In case of reduction chain, e.g.,
6260 # a1 = phi <a3, a0>
6261 a2 = operation (a1)
6262 a3 = operation (a2),
6264 we may end up with more than one vector result. Here we reduce them
6265 to one vector.
6267 The same is true for a SLP reduction, e.g.,
6268 # a1 = phi <a2, a0>
6269 # b1 = phi <b2, b0>
6270 a2 = operation (a1)
6271 b2 = operation (b1),
6273 where we can end up with more than one vector as well. We can
6274 easily accumulate vectors when the number of vector elements is
6275 a multiple of the SLP group size.
6277 The same is true if we couldn't use a single def-use cycle. */
6278 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6279 || direct_slp_reduc
6280 || (slp_reduc
6281 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6282 || ncopies > 1)
6284 gimple_seq stmts = NULL;
6285 tree single_input = reduc_inputs[0];
6286 for (k = 1; k < reduc_inputs.length (); k++)
6287 single_input = gimple_build (&stmts, code, vectype,
6288 single_input, reduc_inputs[k]);
6289 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6291 reduc_inputs.truncate (0);
6292 reduc_inputs.safe_push (single_input);
6295 tree orig_reduc_input = reduc_inputs[0];
6297 /* If this loop is an epilogue loop that can be skipped after the
6298 main loop, we can only share a reduction operation between the
6299 main loop and the epilogue if we put it at the target of the
6300 skip edge.
6302 We can still reuse accumulators if this check fails. Doing so has
6303 the minor(?) benefit of making the epilogue loop's scalar result
6304 independent of the main loop's scalar result. */
6305 bool unify_with_main_loop_p = false;
6306 if (reduc_info->reused_accumulator
6307 && loop_vinfo->skip_this_loop_edge
6308 && single_succ_p (exit_bb)
6309 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6311 unify_with_main_loop_p = true;
6313 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6314 reduc_inputs[0] = make_ssa_name (vectype);
6315 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6316 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6317 UNKNOWN_LOCATION);
6318 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6319 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6320 exit_gsi = gsi_after_labels (reduc_block);
6323 /* Shouldn't be used beyond this point. */
6324 exit_bb = nullptr;
6326 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6327 && reduc_fn != IFN_LAST)
6329 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6330 various data values where the condition matched and another vector
6331 (INDUCTION_INDEX) containing all the indexes of those matches. We
6332 need to extract the last matching index (which will be the index with
6333 highest value) and use this to index into the data vector.
6334 For the case where there were no matches, the data vector will contain
6335 all default values and the index vector will be all zeros. */
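    /* Continuing the illustrative example above: with INDUCTION_INDEX
       {0, 6, 0, 4}, the IFN_REDUC_MAX below yields 6, the comparison
       produces the mask {false, true, false, false}, the VEC_COND keeps
       only the data value in lane 1 (all other lanes become 0), and the
       final unsigned MAX reduction extracts that surviving value.  */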
6337 /* Get various versions of the type of the vector of indexes. */
6338 tree index_vec_type = TREE_TYPE (induction_index);
6339 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6340 tree index_scalar_type = TREE_TYPE (index_vec_type);
6341 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6343 /* Get an unsigned integer version of the type of the data vector. */
6344 int scalar_precision
6345 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6346 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6347 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6348 vectype);
6350 /* First we need to create a vector (ZERO_VEC) of zeros and another
6351 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6352 can create using a MAX reduction and then expanding.
6353 In the case where the loop never made any matches, the max index will
6354 be zero. */
6356 /* Vector of {0, 0, 0,...}. */
6357 tree zero_vec = build_zero_cst (vectype);
6359 /* Find maximum value from the vector of found indexes. */
6360 tree max_index = make_ssa_name (index_scalar_type);
6361 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6362 1, induction_index);
6363 gimple_call_set_lhs (max_index_stmt, max_index);
6364 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6366 /* Vector of {max_index, max_index, max_index,...}. */
6367 tree max_index_vec = make_ssa_name (index_vec_type);
6368 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6369 max_index);
6370 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6371 max_index_vec_rhs);
6372 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6374 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6375 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6376 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6377 otherwise. Only one value should match, resulting in a vector
6378 (VEC_COND) with one data value and the rest zeros.
6379 In the case where the loop never made any matches, every index will
6380 match, resulting in a vector with all data values (which will all be
6381 the default value). */
6383 /* Compare the max index vector to the vector of found indexes to find
6384 the position of the max value. */
6385 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6386 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6387 induction_index,
6388 max_index_vec);
6389 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6391 /* Use the compare to choose either values from the data vector or
6392 zero. */
6393 tree vec_cond = make_ssa_name (vectype);
6394 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6395 vec_compare,
6396 reduc_inputs[0],
6397 zero_vec);
6398 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6400 /* Finally we need to extract the data value from the vector (VEC_COND)
6401 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6402 reduction, but because this doesn't exist, we can use a MAX reduction
6403 instead. The data value might be signed or a float so we need to cast
6404 it first.
6405 In the case where the loop never made any matches, the data values are
6406 all identical, and so will reduce down correctly. */
6408 /* Make the matched data values unsigned. */
6409 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6410 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6411 vec_cond);
6412 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6413 VIEW_CONVERT_EXPR,
6414 vec_cond_cast_rhs);
6415 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6417 /* Reduce down to a scalar value. */
6418 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6419 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6420 1, vec_cond_cast);
6421 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6422 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6424 /* Convert the reduced value back to the result type and set as the
6425 result. */
6426 gimple_seq stmts = NULL;
6427 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6428 data_reduc);
6429 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6430 scalar_results.safe_push (new_temp);
6432 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6433 && reduc_fn == IFN_LAST)
6435 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6436 idx = 0;
6437 idx_val = induction_index[0];
6438 val = data_reduc[0];
6439 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6440 if (induction_index[i] > idx_val)
6441 val = data_reduc[i], idx_val = induction_index[i];
6442 return val; */
6444 tree data_eltype = TREE_TYPE (vectype);
6445 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6446 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6447 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6448 /* Enforced by vectorizable_reduction, which ensures we have target
6449 support before allowing a conditional reduction on variable-length
6450 vectors. */
6451 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6452 tree idx_val = NULL_TREE, val = NULL_TREE;
6453 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6455 tree old_idx_val = idx_val;
6456 tree old_val = val;
6457 idx_val = make_ssa_name (idx_eltype);
6458 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6459 build3 (BIT_FIELD_REF, idx_eltype,
6460 induction_index,
6461 bitsize_int (el_size),
6462 bitsize_int (off)));
6463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6464 val = make_ssa_name (data_eltype);
6465 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6466 build3 (BIT_FIELD_REF,
6467 data_eltype,
6468 reduc_inputs[0],
6469 bitsize_int (el_size),
6470 bitsize_int (off)));
6471 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6472 if (off != 0)
6474 tree new_idx_val = idx_val;
6475 if (off != v_size - el_size)
6477 new_idx_val = make_ssa_name (idx_eltype);
6478 epilog_stmt = gimple_build_assign (new_idx_val,
6479 MAX_EXPR, idx_val,
6480 old_idx_val);
6481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6483 tree cond = make_ssa_name (boolean_type_node);
6484 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6485 idx_val, old_idx_val);
6486 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6487 tree new_val = make_ssa_name (data_eltype);
6488 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6489 cond, val, old_val);
6490 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6491 idx_val = new_idx_val;
6492 val = new_val;
6495 /* Convert the reduced value back to the result type and set as the
6496 result. */
6497 gimple_seq stmts = NULL;
6498 val = gimple_convert (&stmts, scalar_type, val);
6499 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6500 scalar_results.safe_push (val);
6503 /* 2.3 Create the reduction code, using one of the three schemes described
6504 above. In SLP we simply need to extract all the elements from the
6505 vector (without reducing them), so we use scalar shifts. */
6506 else if (reduc_fn != IFN_LAST && !slp_reduc)
6508 tree tmp;
6509 tree vec_elem_type;
6511 /* Case 1: Create:
6512 v_out2 = reduc_expr <v_out1> */
6514 if (dump_enabled_p ())
6515 dump_printf_loc (MSG_NOTE, vect_location,
6516 "Reduce using direct vector reduction.\n");
6518 gimple_seq stmts = NULL;
6519 vec_elem_type = TREE_TYPE (vectype);
6520 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6521 vec_elem_type, reduc_inputs[0]);
6522 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6523 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6525 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6526 && induc_val)
6528 /* Earlier we set the initial value to be a vector of induc_val
6529 values. Check the result and, if it is induc_val, replace it
6530 with the original initial value, unless induc_val is
6531 the same as initial_def already. */
6532 tree zcompare = make_ssa_name (boolean_type_node);
6533 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6534 new_temp, induc_val);
6535 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6536 tree initial_def = reduc_info->reduc_initial_values[0];
6537 tmp = make_ssa_name (new_scalar_dest);
6538 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6539 initial_def, new_temp);
6540 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6541 new_temp = tmp;
6544 scalar_results.safe_push (new_temp);
6546 else if (direct_slp_reduc)
6548 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6549 with the elements for other SLP statements replaced with the
6550 neutral value. We can then do a normal reduction on each vector. */
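    /* An illustrative example (fixed-length shown for clarity): with
       GROUP_SIZE = 2 and a 4-lane partial-result vector {a0, b0, a1, b1},
       the loop below builds {a0, neutral, a1, neutral} for the first SLP
       statement and {neutral, b0, neutral, b1} for the second, then
       reduces each with REDUC_FN to get the two scalar results.  */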
6552 /* Enforced by vectorizable_reduction. */
6553 gcc_assert (reduc_inputs.length () == 1);
6554 gcc_assert (pow2p_hwi (group_size));
6556 gimple_seq seq = NULL;
6558 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6559 and the same element size as VECTYPE. */
6560 tree index = build_index_vector (vectype, 0, 1);
6561 tree index_type = TREE_TYPE (index);
6562 tree index_elt_type = TREE_TYPE (index_type);
6563 tree mask_type = truth_type_for (index_type);
6565 /* Create a vector that, for each element, identifies which of
6566 the REDUC_GROUP_SIZE results should use it. */
6567 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6568 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6569 build_vector_from_val (index_type, index_mask));
6571 /* Get a neutral vector value. This is simply a splat of the neutral
6572 scalar value if we have one, otherwise the initial scalar value
6573 is itself a neutral value. */
6574 tree vector_identity = NULL_TREE;
6575 tree neutral_op = NULL_TREE;
6576 if (slp_node)
6578 tree initial_value = NULL_TREE;
6579 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6580 initial_value = reduc_info->reduc_initial_values[0];
6581 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6582 initial_value, false);
6584 if (neutral_op)
6585 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6586 neutral_op);
6587 for (unsigned int i = 0; i < group_size; ++i)
6589 /* If there's no universal neutral value, we can use the
6590 initial scalar value from the original PHI. This is used
6591 for MIN and MAX reduction, for example. */
6592 if (!neutral_op)
6594 tree scalar_value = reduc_info->reduc_initial_values[i];
6595 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6596 scalar_value);
6597 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6598 scalar_value);
6601 /* Calculate the equivalent of:
6603 sel[j] = (index[j] == i);
6605 which selects the elements of REDUC_INPUTS[0] that should
6606 be included in the result. */
6607 tree compare_val = build_int_cst (index_elt_type, i);
6608 compare_val = build_vector_from_val (index_type, compare_val);
6609 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6610 index, compare_val);
6612 /* Calculate the equivalent of:
6614 vec = sel ? reduc_inputs[0] : vector_identity;
6616 VEC is now suitable for a full vector reduction. */
6617 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6618 sel, reduc_inputs[0], vector_identity);
6620 /* Do the reduction and convert it to the appropriate type. */
6621 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6622 TREE_TYPE (vectype), vec);
6623 scalar = gimple_convert (&seq, scalar_type, scalar);
6624 scalar_results.safe_push (scalar);
6626 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6628 else
6630 bool reduce_with_shift;
6631 tree vec_temp;
6633 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6635 /* See if the target wants to do the final (shift) reduction
6636 in a vector mode of smaller size and first reduce upper/lower
6637 halves against each other. */
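/* Illustrative example (whether this triggers is entirely
   target-specific): a V8SI input might first be reduced to V4SI by
   combining its upper and lower halves, and the shift-based reduction
   below then operates on the narrower vector.  */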
6638 enum machine_mode mode1 = mode;
6639 tree stype = TREE_TYPE (vectype);
6640 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6641 unsigned nunits1 = nunits;
6642 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6643 && reduc_inputs.length () == 1)
6645 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6646 /* For SLP reductions we have to make sure lanes match up, but
6647 since we're doing an individual-element final reduction, reducing
6648 the vector width here is even more important.
6649 ??? We can also separate lanes with permutes; for the common
6650 case of a power-of-two group size, odd/even extracts would work. */
6651 if (slp_reduc && nunits != nunits1)
6653 nunits1 = least_common_multiple (nunits1, group_size);
6654 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6657 if (!slp_reduc
6658 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6659 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6661 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6662 stype, nunits1);
6663 reduce_with_shift = have_whole_vector_shift (mode1);
6664 if (!VECTOR_MODE_P (mode1)
6665 || !directly_supported_p (code, vectype1))
6666 reduce_with_shift = false;
6668 /* First reduce the vector to the desired vector size on which we
6669 should do the shift reduction, by combining upper and lower halves. */
6670 gimple_seq stmts = NULL;
6671 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6672 code, &stmts);
6673 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6674 reduc_inputs[0] = new_temp;
6676 if (reduce_with_shift && !slp_reduc)
6678 int element_bitsize = tree_to_uhwi (bitsize);
6679 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6680 for variable-length vectors and also requires direct target support
6681 for loop reductions. */
6682 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6683 int nelements = vec_size_in_bits / element_bitsize;
6684 vec_perm_builder sel;
6685 vec_perm_indices indices;
6687 int elt_offset;
6689 tree zero_vec = build_zero_cst (vectype1);
6690 /* Case 2: Create:
6691 for (offset = nelements/2; offset >= 1; offset/=2)
6693 Create: va' = vec_shift <va, offset>
6694 Create: va = vop <va, va'>
6695 } */
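/* Worked example (hypothetical four-element vector, PLUS reduction;
   only element 0 matters for the final extract below):
     va  = { a, b, c, d }
     va' = vec_shift <va, 2>  = { c, d, 0, 0 }
     va  = va' + va           = { a+c, b+d, ... }
     va' = vec_shift <va, 1>  = { b+d, ..., 0 }
     va  = va' + va           = { a+b+c+d, ... }  */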
6697 tree rhs;
6699 if (dump_enabled_p ())
6700 dump_printf_loc (MSG_NOTE, vect_location,
6701 "Reduce using vector shifts\n");
6703 gimple_seq stmts = NULL;
6704 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6705 for (elt_offset = nelements / 2;
6706 elt_offset >= 1;
6707 elt_offset /= 2)
6709 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6710 indices.new_vector (sel, 2, nelements);
6711 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6712 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6713 new_temp, zero_vec, mask);
6714 new_temp = gimple_build (&stmts, code,
6715 vectype1, new_name, new_temp);
6717 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6719 /* 2.4 Extract the final scalar result. Create:
6720 s_out3 = extract_field <v_out2, bitpos> */
6722 if (dump_enabled_p ())
6723 dump_printf_loc (MSG_NOTE, vect_location,
6724 "extract scalar result\n");
6726 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6727 bitsize, bitsize_zero_node);
6728 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6729 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6730 gimple_assign_set_lhs (epilog_stmt, new_temp);
6731 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6732 scalar_results.safe_push (new_temp);
6734 else
6736 /* Case 3: Create:
6737 s = extract_field <v_out2, 0>
6738 for (offset = element_size;
6739 offset < vector_size;
6740 offset += element_size;)
6742 Create: s' = extract_field <v_out2, offset>
6743 Create: s = op <s, s'> // For non SLP cases
6744 } */
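/* Worked example (hypothetical four-element vector v, non-SLP):
     s = extract_field <v, 0>
     s = s op extract_field <v, element_size>
     s = s op extract_field <v, 2*element_size>
     s = s op extract_field <v, 3*element_size>
   For SLP the extracted values are collected in SCALAR_RESULTS instead
   and only combined when unrolling makes that necessary.  */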
6746 if (dump_enabled_p ())
6747 dump_printf_loc (MSG_NOTE, vect_location,
6748 "Reduce using scalar code.\n");
6750 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6751 int element_bitsize = tree_to_uhwi (bitsize);
6752 tree compute_type = TREE_TYPE (vectype);
6753 gimple_seq stmts = NULL;
6754 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6756 int bit_offset;
6757 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6758 vec_temp, bitsize, bitsize_zero_node);
6760 /* In SLP we don't need to apply the reduction operation, so we
6761 just collect the s' values in SCALAR_RESULTS. */
6762 if (slp_reduc)
6763 scalar_results.safe_push (new_temp);
6765 for (bit_offset = element_bitsize;
6766 bit_offset < vec_size_in_bits;
6767 bit_offset += element_bitsize)
6769 tree bitpos = bitsize_int (bit_offset);
6770 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6771 compute_type, vec_temp,
6772 bitsize, bitpos);
6773 if (slp_reduc)
6775 /* In SLP we don't need to apply the reduction operation, so
6776 we just collect the s' values in SCALAR_RESULTS. */
6777 new_temp = new_name;
6778 scalar_results.safe_push (new_name);
6780 else
6781 new_temp = gimple_build (&stmts, code, compute_type,
6782 new_name, new_temp);
6786 /* The only case where we need to reduce scalar results in SLP is
6787 unrolling. If the size of SCALAR_RESULTS is greater than
6788 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6789 REDUC_GROUP_SIZE. */
6790 if (slp_reduc)
6792 tree res, first_res, new_res;
6794 /* Reduce multiple scalar results in case of SLP unrolling. */
6795 for (j = group_size; scalar_results.iterate (j, &res);
6796 j++)
6798 first_res = scalar_results[j % group_size];
6799 new_res = gimple_build (&stmts, code, compute_type,
6800 first_res, res);
6801 scalar_results[j % group_size] = new_res;
6803 scalar_results.truncate (group_size);
6804 for (k = 0; k < group_size; k++)
6805 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6806 scalar_results[k]);
6808 else
6810 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6811 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6812 scalar_results.safe_push (new_temp);
6815 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6818 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6819 && induc_val)
6821 /* Earlier we set the initial value to be a vector of induc_val
6822 values. Check the result and if it is induc_val then replace
6823 it with the original initial value, unless induc_val is
6824 the same as initial_def already. */
6825 tree zcompare = make_ssa_name (boolean_type_node);
6826 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6827 induc_val);
6828 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6829 tree initial_def = reduc_info->reduc_initial_values[0];
6830 tree tmp = make_ssa_name (new_scalar_dest);
6831 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6832 initial_def, new_temp);
6833 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6834 scalar_results[0] = tmp;
6838 /* 2.5 Adjust the final result by the initial value of the reduction
6839 variable. (When such adjustment is not needed, then
6840 'adjustment_def' is zero). For example, if code is PLUS we create:
6841 new_temp = loop_exit_def + adjustment_def */
6843 if (adjustment_def)
6845 gcc_assert (!slp_reduc);
6846 gimple_seq stmts = NULL;
6847 if (double_reduc)
6849 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6850 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6851 new_temp = gimple_build (&stmts, code, vectype,
6852 reduc_inputs[0], adjustment_def);
6854 else
6856 new_temp = scalar_results[0];
6857 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6858 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6859 adjustment_def);
6860 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6861 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6862 new_temp, adjustment_def);
6863 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6866 epilog_stmt = gimple_seq_last_stmt (stmts);
6867 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6868 scalar_results[0] = new_temp;
6871 /* Record this operation if it could be reused by the epilogue loop. */
6872 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6873 && reduc_inputs.length () == 1)
6874 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6875 { orig_reduc_input, reduc_info });
6877 if (double_reduc)
6878 loop = outer_loop;
6880 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6881 phis with new adjusted scalar results, i.e., replace use <s_out0>
6882 with use <s_out4>.
6884 Transform:
6885 loop_exit:
6886 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6887 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6888 v_out2 = reduce <v_out1>
6889 s_out3 = extract_field <v_out2, 0>
6890 s_out4 = adjust_result <s_out3>
6891 use <s_out0>
6892 use <s_out0>
6894 into:
6896 loop_exit:
6897 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6898 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6899 v_out2 = reduce <v_out1>
6900 s_out3 = extract_field <v_out2, 0>
6901 s_out4 = adjust_result <s_out3>
6902 use <s_out4>
6903 use <s_out4> */
6905 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6906 for (k = 0; k < live_out_stmts.size (); k++)
6908 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6909 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6911 phis.create (3);
6912 /* Find the loop-closed-use at the loop exit of the original scalar
6913 result. (The reduction result is expected to have two immediate uses,
6914 one at the latch block, and one at the loop exit). For double
6915 reductions we are looking for exit phis of the outer loop. */
6916 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6918 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6920 if (!is_gimple_debug (USE_STMT (use_p)))
6921 phis.safe_push (USE_STMT (use_p));
6923 else
6925 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6927 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6929 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6931 if (!flow_bb_inside_loop_p (loop,
6932 gimple_bb (USE_STMT (phi_use_p)))
6933 && !is_gimple_debug (USE_STMT (phi_use_p)))
6934 phis.safe_push (USE_STMT (phi_use_p));
6940 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6942 /* Replace the uses: */
6943 orig_name = PHI_RESULT (exit_phi);
6945 /* Look for a single use at the target of the skip edge. */
6946 if (unify_with_main_loop_p)
6948 use_operand_p use_p;
6949 gimple *user;
6950 if (!single_imm_use (orig_name, &use_p, &user))
6951 gcc_unreachable ();
6952 orig_name = gimple_get_lhs (user);
6955 scalar_result = scalar_results[k];
6956 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6958 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6959 SET_USE (use_p, scalar_result);
6960 update_stmt (use_stmt);
6964 phis.release ();
6968 /* Return a vector of type VECTYPE that is equal to the vector select
6969 operation "MASK ? VEC : IDENTITY". Insert the select statements
6970 before GSI. */
6972 static tree
6973 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6974 tree vec, tree identity)
6976 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6977 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6978 mask, vec, identity);
6979 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6980 return cond;
6983 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6984 order, starting with LHS. Insert the extraction statements before GSI and
6985 associate the new scalar SSA names with variable SCALAR_DEST.
6986 If MASK is nonzero, mask the input and then operate on it unconditionally.
6987 Return the SSA name for the result. */
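/* A sketch of the expansion below for a hypothetical four-element
   VECTOR_RHS { v0, v1, v2, v3 }:

     lhs = (((LHS CODE v0) CODE v1) CODE v2) CODE v3

   where each element is extracted with a BIT_FIELD_REF.  */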
6989 static tree
6990 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6991 tree_code code, tree lhs, tree vector_rhs,
6992 tree mask)
6994 tree vectype = TREE_TYPE (vector_rhs);
6995 tree scalar_type = TREE_TYPE (vectype);
6996 tree bitsize = TYPE_SIZE (scalar_type);
6997 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6998 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7000 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7001 to perform an unconditional element-wise reduction of it. */
7002 if (mask)
7004 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7005 "masked_vector_rhs");
7006 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7007 false);
7008 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7009 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7010 mask, vector_rhs, vector_identity);
7011 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7012 vector_rhs = masked_vector_rhs;
7015 for (unsigned HOST_WIDE_INT bit_offset = 0;
7016 bit_offset < vec_size_in_bits;
7017 bit_offset += element_bitsize)
7019 tree bitpos = bitsize_int (bit_offset);
7020 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7021 bitsize, bitpos);
7023 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7024 rhs = make_ssa_name (scalar_dest, stmt);
7025 gimple_assign_set_lhs (stmt, rhs);
7026 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7028 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7029 tree new_name = make_ssa_name (scalar_dest, stmt);
7030 gimple_assign_set_lhs (stmt, new_name);
7031 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7032 lhs = new_name;
7034 return lhs;
7037 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7038 type of the vector input. */
7040 static internal_fn
7041 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7043 internal_fn mask_reduc_fn;
7044 internal_fn mask_len_reduc_fn;
7046 switch (reduc_fn)
7048 case IFN_FOLD_LEFT_PLUS:
7049 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7050 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7051 break;
7053 default:
7054 return IFN_LAST;
7057 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7058 OPTIMIZE_FOR_SPEED))
7059 return mask_reduc_fn;
7060 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7061 OPTIMIZE_FOR_SPEED))
7062 return mask_len_reduc_fn;
7063 return IFN_LAST;
7066 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7067 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7068 statement. CODE is the operation performed by STMT_INFO and OPS are
7069 its scalar operands. REDUC_INDEX is the index of the operand in
7070 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7071 implements in-order reduction, or IFN_LAST if we should open-code it.
7072 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7073 that should be used to control the operation in a fully-masked loop. */
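/* For illustration (hypothetical source loop): an in-order reduction
   such as

     double res = init;
     for (int i = 0; i < n; ++i)
       res += a[i];

   is vectorized by feeding each vector of A elements together with the
   scalar accumulator into a fold-left operation, so the floating-point
   additions happen in the original left-to-right order.  */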
7075 static bool
7076 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7077 stmt_vec_info stmt_info,
7078 gimple_stmt_iterator *gsi,
7079 gimple **vec_stmt, slp_tree slp_node,
7080 gimple *reduc_def_stmt,
7081 code_helper code, internal_fn reduc_fn,
7082 tree *ops, int num_ops, tree vectype_in,
7083 int reduc_index, vec_loop_masks *masks,
7084 vec_loop_lens *lens)
7086 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7087 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7088 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7090 int ncopies;
7091 if (slp_node)
7092 ncopies = 1;
7093 else
7094 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7096 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7097 gcc_assert (ncopies == 1);
7099 bool is_cond_op = false;
7100 if (!code.is_tree_code ())
7102 code = conditional_internal_fn_code (internal_fn (code));
7103 gcc_assert (code != ERROR_MARK);
7104 is_cond_op = true;
7107 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7109 if (slp_node)
7111 if (is_cond_op)
7113 if (dump_enabled_p ())
7114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7115 "fold-left reduction on SLP not supported.\n");
7116 return false;
7119 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7120 TYPE_VECTOR_SUBPARTS (vectype_in)));
7123 /* The operands either come from a binary operation or an IFN_COND operation.
7124 The former is a gimple assign with binary rhs and the latter is a
7125 gimple call with four arguments. */
7126 gcc_assert (num_ops == 2 || num_ops == 4);
7127 tree op0, opmask;
7128 if (!is_cond_op)
7129 op0 = ops[1 - reduc_index];
7130 else
7132 op0 = ops[2 + (1 - reduc_index)];
7133 opmask = ops[0];
7134 gcc_assert (!slp_node);
7137 int group_size = 1;
7138 stmt_vec_info scalar_dest_def_info;
7139 auto_vec<tree> vec_oprnds0, vec_opmask;
7140 if (slp_node)
7142 auto_vec<vec<tree> > vec_defs (2);
7143 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7144 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7145 vec_defs[0].release ();
7146 vec_defs[1].release ();
7147 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7148 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7150 else
7152 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7153 op0, &vec_oprnds0);
7154 scalar_dest_def_info = stmt_info;
7156 /* For an IFN_COND_OP we also need the vector mask operand. */
7157 if (is_cond_op)
7158 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7159 opmask, &vec_opmask);
7162 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7163 tree scalar_dest = gimple_get_lhs (sdef);
7164 tree scalar_type = TREE_TYPE (scalar_dest);
7165 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7167 int vec_num = vec_oprnds0.length ();
7168 gcc_assert (vec_num == 1 || slp_node);
7169 tree vec_elem_type = TREE_TYPE (vectype_out);
7170 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7172 tree vector_identity = NULL_TREE;
7173 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7175 vector_identity = build_zero_cst (vectype_out);
7176 if (!HONOR_SIGNED_ZEROS (vectype_out))
7178 else
7180 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7181 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7182 vector_identity);
7186 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7187 int i;
7188 tree def0;
7189 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7191 gimple *new_stmt;
7192 tree mask = NULL_TREE;
7193 tree len = NULL_TREE;
7194 tree bias = NULL_TREE;
7195 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7196 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7197 else if (is_cond_op)
7198 mask = vec_opmask[0];
7199 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7201 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7202 i, 1);
7203 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7204 bias = build_int_cst (intQI_type_node, biasval);
7205 if (!is_cond_op)
7206 mask = build_minus_one_cst (truth_type_for (vectype_in));
7209 /* Handle MINUS by adding the negative. */
7210 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7212 tree negated = make_ssa_name (vectype_out);
7213 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7214 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7215 def0 = negated;
7218 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7219 && mask && mask_reduc_fn == IFN_LAST)
7220 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7221 vector_identity);
7223 /* On the first iteration the input is simply the scalar phi
7224 result, and for subsequent iterations it is the output of
7225 the preceding operation. */
7226 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7228 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7229 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7230 def0, mask, len, bias);
7231 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7232 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7233 def0, mask);
7234 else
7235 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7236 def0);
7237 /* For chained SLP reductions the output of the previous reduction
7238 operation serves as the input of the next. For the final statement
7239 the output cannot be a temporary - we reuse the original
7240 scalar destination of the last statement. */
7241 if (i != vec_num - 1)
7243 gimple_set_lhs (new_stmt, scalar_dest_var);
7244 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7245 gimple_set_lhs (new_stmt, reduc_var);
7248 else
7250 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7251 tree_code (code), reduc_var, def0,
7252 mask);
7253 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7254 /* Remove the statement, so that we can use the same code paths
7255 as for statements that we've just created. */
7256 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7257 gsi_remove (&tmp_gsi, true);
7260 if (i == vec_num - 1)
7262 gimple_set_lhs (new_stmt, scalar_dest);
7263 vect_finish_replace_stmt (loop_vinfo,
7264 scalar_dest_def_info,
7265 new_stmt);
7267 else
7268 vect_finish_stmt_generation (loop_vinfo,
7269 scalar_dest_def_info,
7270 new_stmt, gsi);
7272 if (slp_node)
7273 slp_node->push_vec_def (new_stmt);
7274 else
7276 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7277 *vec_stmt = new_stmt;
7281 return true;
7284 /* Function is_nonwrapping_integer_induction.
7286 Check that STMT_VINFO (which is part of loop LOOP) both increments
7287 and does not cause overflow. */
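/* For example (hypothetical numbers): with base 10, step 3 and at most
   100 iterations the largest value reached is 10 + 3 * 100 = 310, which
   needs fewer bits than a 32-bit induction variable provides, so the
   induction cannot wrap.  */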
7289 static bool
7290 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7292 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7293 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7294 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7295 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7296 widest_int ni, max_loop_value, lhs_max;
7297 wi::overflow_type overflow = wi::OVF_NONE;
7299 /* Make sure the loop is integer based. */
7300 if (TREE_CODE (base) != INTEGER_CST
7301 || TREE_CODE (step) != INTEGER_CST)
7302 return false;
7304 /* Check that the max size of the loop will not wrap. */
7306 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7307 return true;
7309 if (! max_stmt_executions (loop, &ni))
7310 return false;
7312 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7313 &overflow);
7314 if (overflow)
7315 return false;
7317 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7318 TYPE_SIGN (lhs_type), &overflow);
7319 if (overflow)
7320 return false;
7322 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7323 <= TYPE_PRECISION (lhs_type));
7326 /* Check if masking can be supported by inserting a conditional expression.
7327 CODE is the code for the operation. COND_FN is the conditional internal
7328 function, if it exists. VECTYPE_IN is the type of the vector input. */
7329 static bool
7330 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7331 tree vectype_in)
7333 if (cond_fn != IFN_LAST
7334 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7335 OPTIMIZE_FOR_SPEED))
7336 return false;
7338 if (code.is_tree_code ())
7339 switch (tree_code (code))
7341 case DOT_PROD_EXPR:
7342 case SAD_EXPR:
7343 return true;
7345 default:
7346 break;
7348 return false;
7351 /* Insert a conditional expression to enable masked vectorization. CODE is the
7352 code for the operation. VOP is the array of operands. MASK is the loop
7353 mask. GSI is a statement iterator used to place the new conditional
7354 expression. */
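/* For example (illustrative): for DOT_PROD_EXPR the masked-off lanes of
   operand 1 are replaced with zero, so they contribute nothing to the
   accumulator; for SAD_EXPR they are replaced with operand 0, so the
   absolute difference in those lanes is zero.  */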
7355 static void
7356 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7357 gimple_stmt_iterator *gsi)
7359 switch (tree_code (code))
7361 case DOT_PROD_EXPR:
7363 tree vectype = TREE_TYPE (vop[1]);
7364 tree zero = build_zero_cst (vectype);
7365 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7366 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7367 mask, vop[1], zero);
7368 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7369 vop[1] = masked_op1;
7370 break;
7373 case SAD_EXPR:
7375 tree vectype = TREE_TYPE (vop[1]);
7376 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7377 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7378 mask, vop[1], vop[0]);
7379 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7380 vop[1] = masked_op1;
7381 break;
7384 default:
7385 gcc_unreachable ();
7389 /* Function vectorizable_reduction.
7391 Check if STMT_INFO performs a reduction operation that can be vectorized.
7392 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7393 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7394 Return true if STMT_INFO is vectorizable in this way.
7396 This function also handles reduction idioms (patterns) that have been
7397 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7398 may be of this form:
7399 X = pattern_expr (arg0, arg1, ..., X)
7400 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7401 sequence that had been detected and replaced by the pattern-stmt
7402 (STMT_INFO).
7404 This function also handles reduction of condition expressions, for example:
7405 for (int i = 0; i < N; i++)
7406 if (a[i] < value)
7407 last = a[i];
7408 This is handled by vectorising the loop and creating an additional vector
7409 containing the loop indexes for which "a[i] < value" was true. In the
7410 function epilogue this is reduced to a single max value and then used to
7411 index into the vector of results.
7413 In some cases of reduction patterns, the type of the reduction variable X is
7414 different than the type of the other arguments of STMT_INFO.
7415 In such cases, the vectype that is used when transforming STMT_INFO into
7416 a vector stmt is different than the vectype that is used to determine the
7417 vectorization factor, because it consists of a different number of elements
7418 than the actual number of elements that are being operated upon in parallel.
7420 For example, consider an accumulation of shorts into an int accumulator.
7421 On some targets it's possible to vectorize this pattern operating on 8
7422 shorts at a time (hence, the vectype for purposes of determining the
7423 vectorization factor should be V8HI); on the other hand, the vectype that
7424 is used to create the vector form is actually V4SI (the type of the result).
7426 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7427 indicates what is the actual level of parallelism (V8HI in the example), so
7428 that the right vectorization factor would be derived. This vectype
7429 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7430 be used to create the vectorized stmt. The right vectype for the vectorized
7431 stmt is obtained from the type of the result X:
7432 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7434 This means that, contrary to "regular" reductions (or "regular" stmts in
7435 general), the following equation:
7436 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7437 does *NOT* necessarily hold for reduction patterns. */
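/* A small sketch of the shorts-into-int example above (hypothetical
   source code):

     short s[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += s[i];    // recognized as sum = widen_sum <s[i], sum>

   Here STMT_VINFO_VECTYPE might be V8HI (eight shorts per vector
   iteration, which determines the vectorization factor), while the
   vectorized statement itself produces a V4SI accumulator, the vectype
   derived from the type of the result.  */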
7439 bool
7440 vectorizable_reduction (loop_vec_info loop_vinfo,
7441 stmt_vec_info stmt_info, slp_tree slp_node,
7442 slp_instance slp_node_instance,
7443 stmt_vector_for_cost *cost_vec)
7445 tree vectype_in = NULL_TREE;
7446 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7447 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7448 stmt_vec_info cond_stmt_vinfo = NULL;
7449 int i;
7450 int ncopies;
7451 bool single_defuse_cycle = false;
7452 bool nested_cycle = false;
7453 bool double_reduc = false;
7454 int vec_num;
7455 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7456 tree cond_reduc_val = NULL_TREE;
7458 /* Make sure it was already recognized as a reduction computation. */
7459 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7460 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7461 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7462 return false;
7464 /* The stmt we store reduction analysis meta on. */
7465 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7466 reduc_info->is_reduc_info = true;
7468 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7470 if (is_a <gphi *> (stmt_info->stmt))
7472 if (slp_node)
7474 /* We eventually need to set a vector type on invariant
7475 arguments. */
7476 unsigned j;
7477 slp_tree child;
7478 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7479 if (!vect_maybe_update_slp_op_vectype
7480 (child, SLP_TREE_VECTYPE (slp_node)))
7482 if (dump_enabled_p ())
7483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7484 "incompatible vector types for "
7485 "invariants\n");
7486 return false;
7489 /* Analysis for double-reduction is done on the outer
7490 loop PHI, nested cycles have no further restrictions. */
7491 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7493 else
7494 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7495 return true;
7498 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7499 stmt_vec_info phi_info = stmt_info;
7500 if (!is_a <gphi *> (stmt_info->stmt))
7502 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7503 return true;
7505 if (slp_node)
7507 slp_node_instance->reduc_phis = slp_node;
7508 /* ??? We're leaving slp_node to point to the PHIs; we only
7509 need it to get at the number of vector stmts, which wasn't
7510 yet initialized for the instance root. */
7512 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7514 use_operand_p use_p;
7515 gimple *use_stmt;
7516 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7517 &use_p, &use_stmt);
7518 gcc_assert (res);
7519 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7522 /* PHIs should not participate in patterns. */
7523 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7524 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7526 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7527 and compute the reduction chain length. Discover the real
7528 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7529 tree reduc_def
7530 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7531 loop_latch_edge
7532 (gimple_bb (reduc_def_phi)->loop_father));
7533 unsigned reduc_chain_length = 0;
7534 bool only_slp_reduc_chain = true;
7535 stmt_info = NULL;
7536 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7537 while (reduc_def != PHI_RESULT (reduc_def_phi))
7539 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7540 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7541 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7543 if (dump_enabled_p ())
7544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7545 "reduction chain broken by patterns.\n");
7546 return false;
7548 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7549 only_slp_reduc_chain = false;
7550 /* For epilogue generation live members of the chain need
7551 to point back to the PHI via their original stmt for
7552 info_for_reduction to work. For SLP we need to look at
7553 all lanes here - even though we will only vectorize from
7554 the SLP node with live lane zero, the other live lanes also
7555 need to be identified as part of a reduction to be able
7556 to skip code generation for them. */
7557 if (slp_for_stmt_info)
7559 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7560 if (STMT_VINFO_LIVE_P (s))
7561 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7563 else if (STMT_VINFO_LIVE_P (vdef))
7564 STMT_VINFO_REDUC_DEF (def) = phi_info;
7565 gimple_match_op op;
7566 if (!gimple_extract_op (vdef->stmt, &op))
7568 if (dump_enabled_p ())
7569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7570 "reduction chain includes unsupported"
7571 " statement type.\n");
7572 return false;
7574 if (CONVERT_EXPR_CODE_P (op.code))
7576 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7578 if (dump_enabled_p ())
7579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7580 "conversion in the reduction chain.\n");
7581 return false;
7584 else if (!stmt_info)
7585 /* First non-conversion stmt. */
7586 stmt_info = vdef;
7587 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7588 reduc_chain_length++;
7589 if (!stmt_info && slp_node)
7590 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7592 /* PHIs should not participate in patterns. */
7593 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7595 if (nested_in_vect_loop_p (loop, stmt_info))
7597 loop = loop->inner;
7598 nested_cycle = true;
7601 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7602 element. */
7603 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7605 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7606 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7608 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7609 gcc_assert (slp_node
7610 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7612 /* 1. Is vectorizable reduction? */
7613 /* Not supportable if the reduction variable is used in the loop, unless
7614 it's a reduction chain. */
7615 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7616 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7617 return false;
7619 /* Reductions that are not used even in an enclosing outer-loop,
7620 are expected to be "live" (used out of the loop). */
7621 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7622 && !STMT_VINFO_LIVE_P (stmt_info))
7623 return false;
7625 /* 2. Has this been recognized as a reduction pattern?
7627 Check if STMT represents a pattern that has been recognized
7628 in earlier analysis stages. For stmts that represent a pattern,
7629 the STMT_VINFO_RELATED_STMT field records the last stmt in
7630 the original sequence that constitutes the pattern. */
7632 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7633 if (orig_stmt_info)
7635 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7636 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7639 /* 3. Check the operands of the operation. The first operands are defined
7640 inside the loop body. The last operand is the reduction variable,
7641 which is defined by the loop-header-phi. */
7643 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7644 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7645 gimple_match_op op;
7646 if (!gimple_extract_op (stmt_info->stmt, &op))
7647 gcc_unreachable ();
7648 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7649 || op.code == WIDEN_SUM_EXPR
7650 || op.code == SAD_EXPR);
7652 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7653 && !SCALAR_FLOAT_TYPE_P (op.type))
7654 return false;
7656 /* Do not try to vectorize bit-precision reductions. */
7657 if (!type_has_mode_precision_p (op.type))
7658 return false;
7660 /* For lane-reducing ops we're reducing the number of reduction PHIs
7661 which means the only use of that may be in the lane-reducing operation. */
7662 if (lane_reduc_code_p
7663 && reduc_chain_length != 1
7664 && !only_slp_reduc_chain)
7666 if (dump_enabled_p ())
7667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7668 "lane-reducing reduction with extra stmts.\n");
7669 return false;
7672 /* All uses but the last are expected to be defined in the loop.
7673 The last use is the reduction variable. In case of nested cycle this
7674 assumption is not true: we use reduc_index to record the index of the
7675 reduction variable. */
7676 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7677 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7678 /* We need to skip an extra operand for COND_EXPRs with embedded
7679 comparison. */
7680 unsigned opno_adjust = 0;
7681 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7682 opno_adjust = 1;
7683 for (i = 0; i < (int) op.num_ops; i++)
7685 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7686 if (i == 0 && op.code == COND_EXPR)
7687 continue;
7689 stmt_vec_info def_stmt_info;
7690 enum vect_def_type dt;
7691 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7692 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7693 &vectype_op[i], &def_stmt_info))
7695 if (dump_enabled_p ())
7696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7697 "use not simple.\n");
7698 return false;
7700 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7701 continue;
7703 /* For an IFN_COND_OP we might hit the reduction definition operand
7704 twice (once as definition, once as else). */
7705 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7706 continue;
7708 /* There should be only one cycle def in the stmt, the one
7709 leading to reduc_def. */
7710 if (VECTORIZABLE_CYCLE_DEF (dt))
7711 return false;
7713 if (!vectype_op[i])
7714 vectype_op[i]
7715 = get_vectype_for_scalar_type (loop_vinfo,
7716 TREE_TYPE (op.ops[i]), slp_op[i]);
7718 /* To properly compute ncopies we are interested in the widest
7719 non-reduction input type in case we're looking at a widening
7720 accumulation that we later handle in vect_transform_reduction. */
7721 if (lane_reduc_code_p
7722 && vectype_op[i]
7723 && (!vectype_in
7724 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7725 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7726 vectype_in = vectype_op[i];
7728 if (op.code == COND_EXPR)
7730 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7731 if (dt == vect_constant_def)
7733 cond_reduc_dt = dt;
7734 cond_reduc_val = op.ops[i];
7736 if (dt == vect_induction_def
7737 && def_stmt_info
7738 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7740 cond_reduc_dt = dt;
7741 cond_stmt_vinfo = def_stmt_info;
7745 if (!vectype_in)
7746 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7747 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7749 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7750 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7751 /* If we have a condition reduction, see if we can simplify it further. */
7752 if (v_reduc_type == COND_REDUCTION)
7754 if (slp_node)
7755 return false;
7757 /* When the condition uses the reduction value in the condition, fail. */
7758 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7760 if (dump_enabled_p ())
7761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7762 "condition depends on previous iteration\n");
7763 return false;
7766 if (reduc_chain_length == 1
7767 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7768 OPTIMIZE_FOR_SPEED)
7769 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7770 vectype_in,
7771 OPTIMIZE_FOR_SPEED)))
7773 if (dump_enabled_p ())
7774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7775 "optimizing condition reduction with"
7776 " FOLD_EXTRACT_LAST.\n");
7777 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7779 else if (cond_reduc_dt == vect_induction_def)
7781 tree base
7782 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7783 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7785 gcc_assert (TREE_CODE (base) == INTEGER_CST
7786 && TREE_CODE (step) == INTEGER_CST);
7787 cond_reduc_val = NULL_TREE;
7788 enum tree_code cond_reduc_op_code = ERROR_MARK;
7789 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7790 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7792 /* Find a suitable value: below base for MAX_EXPR, above base for
7793 MIN_EXPR; punt for now if base is the minimum value of the type
7794 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7795 else if (tree_int_cst_sgn (step) == -1)
7797 cond_reduc_op_code = MIN_EXPR;
7798 if (tree_int_cst_sgn (base) == -1)
7799 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7800 else if (tree_int_cst_lt (base,
7801 TYPE_MAX_VALUE (TREE_TYPE (base))))
7802 cond_reduc_val
7803 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7805 else
7807 cond_reduc_op_code = MAX_EXPR;
7808 if (tree_int_cst_sgn (base) == 1)
7809 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7810 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7811 base))
7812 cond_reduc_val
7813 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7815 if (cond_reduc_val)
7817 if (dump_enabled_p ())
7818 dump_printf_loc (MSG_NOTE, vect_location,
7819 "condition expression based on "
7820 "integer induction.\n");
7821 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7822 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7823 = cond_reduc_val;
7824 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7827 else if (cond_reduc_dt == vect_constant_def)
7829 enum vect_def_type cond_initial_dt;
7830 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7831 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7832 if (cond_initial_dt == vect_constant_def
7833 && types_compatible_p (TREE_TYPE (cond_initial_val),
7834 TREE_TYPE (cond_reduc_val)))
7836 tree e = fold_binary (LE_EXPR, boolean_type_node,
7837 cond_initial_val, cond_reduc_val);
7838 if (e && (integer_onep (e) || integer_zerop (e)))
7840 if (dump_enabled_p ())
7841 dump_printf_loc (MSG_NOTE, vect_location,
7842 "condition expression based on "
7843 "compile time constant.\n");
7844 /* Record reduction code at analysis stage. */
7845 STMT_VINFO_REDUC_CODE (reduc_info)
7846 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7847 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7853 if (STMT_VINFO_LIVE_P (phi_info))
7854 return false;
7856 if (slp_node)
7857 ncopies = 1;
7858 else
7859 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7861 gcc_assert (ncopies >= 1);
7863 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7865 if (nested_cycle)
7867 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7868 == vect_double_reduction_def);
7869 double_reduc = true;
7872 /* 4.2. Check support for the epilog operation.
7874 If STMT represents a reduction pattern, then the type of the
7875 reduction variable may be different than the type of the rest
7876 of the arguments. For example, consider the case of accumulation
7877 of shorts into an int accumulator; The original code:
7878 S1: int_a = (int) short_a;
7879 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7881 was replaced with:
7882 STMT: int_acc = widen_sum <short_a, int_acc>
7884 This means that:
7885 1. The tree-code that is used to create the vector operation in the
7886 epilog code (that reduces the partial results) is not the
7887 tree-code of STMT, but is rather the tree-code of the original
7888 stmt from the pattern that STMT is replacing. I.e, in the example
7889 above we want to use 'widen_sum' in the loop, but 'plus' in the
7890 epilog.
7891 2. The type (mode) we use to check available target support
7892 for the vector operation to be created in the *epilog*, is
7893 determined by the type of the reduction variable (in the example
7894 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7895 However the type (mode) we use to check available target support
7896 for the vector operation to be created *inside the loop*, is
7897 determined by the type of the other arguments to STMT (in the
7898 example we'd check this: optab_handler (widen_sum_optab,
7899 vect_short_mode)).
7901 This is contrary to "regular" reductions, in which the types of all
7902 the arguments are the same as the type of the reduction variable.
7903 For "regular" reductions we can therefore use the same vector type
7904 (and also the same tree-code) when generating the epilog code and
7905 when generating the code inside the loop. */
7907 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7909 /* The conversion might have created a conditional operation like
7910 IFN_COND_ADD already. If so, use the corresponding tree code for the following checks. */
7911 if (orig_code.is_internal_fn ())
7913 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7914 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7917 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7919 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7920 if (reduction_type == TREE_CODE_REDUCTION)
7922 /* Check whether it's ok to change the order of the computation.
7923 Generally, when vectorizing a reduction we change the order of the
7924 computation. This may change the behavior of the program in some
7925 cases, so we need to check that this is ok. One exception is when
7926 vectorizing an outer-loop: the inner-loop is executed sequentially,
7927 and therefore vectorizing reductions in the inner-loop during
7928 outer-loop vectorization is safe. Likewise, when we are vectorizing
7929 a series of reductions using SLP and the VF is one, the reductions
7930 are performed in scalar order. */
7931 if (slp_node
7932 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7933 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7935 else if (needs_fold_left_reduction_p (op.type, orig_code))
7937 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7938 is not directly used in stmt. */
7939 if (!only_slp_reduc_chain
7940 && reduc_chain_length != 1)
7942 if (dump_enabled_p ())
7943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7944 "in-order reduction chain without SLP.\n");
7945 return false;
7947 STMT_VINFO_REDUC_TYPE (reduc_info)
7948 = reduction_type = FOLD_LEFT_REDUCTION;
7950 else if (!commutative_binary_op_p (orig_code, op.type)
7951 || !associative_binary_op_p (orig_code, op.type))
7953 if (dump_enabled_p ())
7954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7955 "reduction: not commutative/associative\n");
7956 return false;
7960 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7961 && ncopies > 1)
7963 if (dump_enabled_p ())
7964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7965 "multiple types in double reduction or condition "
7966 "reduction or fold-left reduction.\n");
7967 return false;
7970 internal_fn reduc_fn = IFN_LAST;
7971 if (reduction_type == TREE_CODE_REDUCTION
7972 || reduction_type == FOLD_LEFT_REDUCTION
7973 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7974 || reduction_type == CONST_COND_REDUCTION)
7976 if (reduction_type == FOLD_LEFT_REDUCTION
7977 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7978 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7980 if (reduc_fn != IFN_LAST
7981 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7982 OPTIMIZE_FOR_SPEED))
7984 if (dump_enabled_p ())
7985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7986 "reduc op not supported by target.\n");
7988 reduc_fn = IFN_LAST;
7991 else
7993 if (!nested_cycle || double_reduc)
7995 if (dump_enabled_p ())
7996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7997 "no reduc code for scalar code.\n");
7999 return false;
8003 else if (reduction_type == COND_REDUCTION)
8005 int scalar_precision
8006 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8007 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8008 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8009 vectype_out);
8011 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8012 OPTIMIZE_FOR_SPEED))
8013 reduc_fn = IFN_REDUC_MAX;
8015 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8017 if (reduction_type != EXTRACT_LAST_REDUCTION
8018 && (!nested_cycle || double_reduc)
8019 && reduc_fn == IFN_LAST
8020 && !nunits_out.is_constant ())
8022 if (dump_enabled_p ())
8023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8024 "missing target support for reduction on"
8025 " variable-length vectors.\n");
8026 return false;
8029 /* For SLP reductions, see if there is a neutral value we can use. */
8030 tree neutral_op = NULL_TREE;
8031 if (slp_node)
8033 tree initial_value = NULL_TREE;
8034 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8035 initial_value = vect_phi_initial_value (reduc_def_phi);
8036 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8037 orig_code, initial_value);
8040 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8042 /* We can't support in-order reductions of code such as this:
8044 for (int i = 0; i < n1; ++i)
8045 for (int j = 0; j < n2; ++j)
8046 l += a[j];
8048 since GCC effectively transforms the loop when vectorizing:
8050 for (int i = 0; i < n1 / VF; ++i)
8051 for (int j = 0; j < n2; ++j)
8052 for (int k = 0; k < VF; ++k)
8053 l += a[j];
8055 which is a reassociation of the original operation. */
8056 if (dump_enabled_p ())
8057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8058 "in-order double reduction not supported.\n");
8060 return false;
8063 if (reduction_type == FOLD_LEFT_REDUCTION
8064 && slp_node
8065 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8067 /* We cannot use in-order reductions in this case because there is
8068 an implicit reassociation of the operations involved. */
8069 if (dump_enabled_p ())
8070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8071 "in-order unchained SLP reductions not supported.\n");
8072 return false;
8075 /* For double reductions, and for SLP reductions with a neutral value,
8076 we construct a variable-length initial vector by loading a vector
8077 full of the neutral value and then shift-and-inserting the start
8078 values into the low-numbered elements. */
8079 if ((double_reduc || neutral_op)
8080 && !nunits_out.is_constant ()
8081 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8082 vectype_out, OPTIMIZE_FOR_SPEED))
8084 if (dump_enabled_p ())
8085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8086 "reduction on variable-length vectors requires"
8087 " target support for a vector-shift-and-insert"
8088 " operation.\n");
8089 return false;
8092 /* Check extra constraints for variable-length unchained SLP reductions. */
8093 if (slp_node
8094 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8095 && !nunits_out.is_constant ())
8097 /* We checked above that we could build the initial vector when
8098 there's a neutral element value. Check here for the case in
8099 which each SLP statement has its own initial value and in which
8100 that value needs to be repeated for every instance of the
8101 statement within the initial vector. */
8102 unsigned int group_size = SLP_TREE_LANES (slp_node);
8103 if (!neutral_op
8104 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8105 TREE_TYPE (vectype_out)))
8107 if (dump_enabled_p ())
8108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8109 "unsupported form of SLP reduction for"
8110 " variable-length vectors: cannot build"
8111 " initial vector.\n");
8112 return false;
8114 /* The epilogue code relies on the number of elements being a multiple
8115 of the group size. The duplicate-and-interleave approach to setting
8116 up the initial vector does too. */
8117 if (!multiple_p (nunits_out, group_size))
8119 if (dump_enabled_p ())
8120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8121 "unsupported form of SLP reduction for"
8122 " variable-length vectors: the vector size"
8123 " is not a multiple of the number of results.\n");
8124 return false;
8128 if (reduction_type == COND_REDUCTION)
8130 widest_int ni;
8132 if (! max_loop_iterations (loop, &ni))
8134 if (dump_enabled_p ())
8135 dump_printf_loc (MSG_NOTE, vect_location,
8136 "loop count not known, cannot create cond "
8137 "reduction.\n");
8138 return false;
8140 /* Convert backedges to iterations. */
8141 ni += 1;
8143 /* The additional index will be the same type as the condition. Check
8144 that the loop can fit into this less one (because we'll use up the
8145 zero slot for when there are no matches). */
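/* For example (hypothetical): if the reduction operates on an 8-bit
   type, the index elements are unsigned 8-bit values with a maximum of
   255, so a loop with 255 or more iterations is rejected here.  */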
8146 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8147 if (wi::geu_p (ni, wi::to_widest (max_index)))
8149 if (dump_enabled_p ())
8150 dump_printf_loc (MSG_NOTE, vect_location,
8151 "loop size is greater than data size.\n");
8152 return false;
8156 /* In case the vectorization factor (VF) is bigger than the number
8157 of elements that we can fit in a vectype (nunits), we have to generate
8158 more than one vector stmt, i.e. we need to "unroll" the
8159 vector stmt by a factor VF/nunits. For more details see documentation
8160 in vectorizable_operation. */
8162 /* If the reduction is used in an outer loop we need to generate
8163 VF intermediate results, like so (e.g. for ncopies=2):
8164 r0 = phi (init, r0)
8165 r1 = phi (init, r1)
8166 r0 = x0 + r0;
8167 r1 = x1 + r1;
8168 (i.e. we generate VF results in 2 registers).
8169 In this case we have a separate def-use cycle for each copy, and therefore
8170 for each copy we get the vector def for the reduction variable from the
8171 respective phi node created for this copy.
8173 Otherwise (the reduction is unused in the loop nest), we can combine
8174 together intermediate results, like so (e.g. for ncopies=2):
8175 r = phi (init, r)
8176 r = x0 + r;
8177 r = x1 + r;
8178 (i.e. we generate VF/2 results in a single register).
8179 In this case for each copy we get the vector def for the reduction variable
8180 from the vectorized reduction operation generated in the previous iteration.
8182 This only works when we see both the reduction PHI and its only consumer
8183 in vectorizable_reduction and there are no intermediate stmts
8184 participating. When unrolling we want each unrolled iteration to have its
8185 own reduction accumulator since one of the main goals of unrolling a
8186 reduction is to reduce the aggregate loop-carried latency. */
8187 if (ncopies > 1
8188 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8189 && reduc_chain_length == 1
8190 && loop_vinfo->suggested_unroll_factor == 1)
8191 single_defuse_cycle = true;
8193 if (single_defuse_cycle || lane_reduc_code_p)
8195 gcc_assert (op.code != COND_EXPR);
8197 /* 4. Supportable by target? */
8198 bool ok = true;
8200 /* 4.1. check support for the operation in the loop
8202 This isn't necessary for the lane reduction codes, since they
8203 can only be produced by pattern matching, and it's up to the
8204 pattern matcher to test for support. The main reason for
8205 specifically skipping this step is to avoid rechecking whether
8206 mixed-sign dot-products can be implemented using signed
8207 dot-products. */
8208 machine_mode vec_mode = TYPE_MODE (vectype_in);
8209 if (!lane_reduc_code_p
8210 && !directly_supported_p (op.code, vectype_in, optab_vector))
8212 if (dump_enabled_p ())
8213 dump_printf (MSG_NOTE, "op not supported by target.\n");
8214 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8215 || !vect_can_vectorize_without_simd_p (op.code))
8216 ok = false;
8217 else
8218 if (dump_enabled_p ())
8219 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8222 if (vect_emulated_vector_p (vectype_in)
8223 && !vect_can_vectorize_without_simd_p (op.code))
8225 if (dump_enabled_p ())
8226 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8227 return false;
8230 /* Lane-reducing operations have to go through vect_transform_reduction.
8231 For the other cases try without the single-cycle optimization. */
8232 if (!ok)
8234 if (lane_reduc_code_p)
8235 return false;
8236 else
8237 single_defuse_cycle = false;
8240 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8242 /* If the reduction stmt is one of the patterns that have lane
8243 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8244 if ((ncopies > 1 && ! single_defuse_cycle)
8245 && lane_reduc_code_p)
8247 if (dump_enabled_p ())
8248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8249 "multi def-use cycle not possible for lane-reducing "
8250 "reduction operation\n");
8251 return false;
8254 if (slp_node
8255 && !(!single_defuse_cycle
8256 && !lane_reduc_code_p
8257 && reduction_type != FOLD_LEFT_REDUCTION))
8258 for (i = 0; i < (int) op.num_ops; i++)
8259 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8261 if (dump_enabled_p ())
8262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8263 "incompatible vector types for invariants\n");
8264 return false;
8267 if (slp_node)
8268 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8269 else
8270 vec_num = 1;
8272 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8273 reduction_type, ncopies, cost_vec);
8274 /* Cost the reduction op inside the loop if transformed via
8275 vect_transform_reduction. Otherwise this is costed by the
8276 separate vectorizable_* routines. */
8277 if (single_defuse_cycle || lane_reduc_code_p)
8279 int factor = 1;
8280 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8281 /* Three dot-products and a subtraction. */
8282 factor = 4;
8283 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8284 stmt_info, 0, vect_body);
8287 if (dump_enabled_p ()
8288 && reduction_type == FOLD_LEFT_REDUCTION)
8289 dump_printf_loc (MSG_NOTE, vect_location,
8290 "using an in-order (fold-left) reduction.\n");
8291 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8292 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8293 reductions go through their own vectorizable_* routines. */
8294 if (!single_defuse_cycle
8295 && !lane_reduc_code_p
8296 && reduction_type != FOLD_LEFT_REDUCTION)
8298 stmt_vec_info tem
8299 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8300 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8302 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8303 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8305 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8306 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8308 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8310 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8311 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8312 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8314 if (reduction_type != FOLD_LEFT_REDUCTION
8315 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8316 && (cond_fn == IFN_LAST
8317 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8318 OPTIMIZE_FOR_SPEED)))
8320 if (dump_enabled_p ())
8321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8322 "can't operate on partial vectors because"
8323 " no conditional operation is available.\n");
8324 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8326 else if (reduction_type == FOLD_LEFT_REDUCTION
8327 && reduc_fn == IFN_LAST
8328 && !expand_vec_cond_expr_p (vectype_in,
8329 truth_type_for (vectype_in),
8330 SSA_NAME))
8332 if (dump_enabled_p ())
8333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8334 "can't operate on partial vectors because"
8335 " no conditional operation is available.\n");
8336 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8338 else if (reduction_type == FOLD_LEFT_REDUCTION
8339 && internal_fn_mask_index (reduc_fn) == -1
8340 && FLOAT_TYPE_P (vectype_in)
8341 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8343 if (dump_enabled_p ())
8344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8345 "can't operate on partial vectors because"
8346 " signed zeros cannot be preserved.\n");
8347 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8349 else
8351 internal_fn mask_reduc_fn
8352 = get_masked_reduction_fn (reduc_fn, vectype_in);
8354 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8355 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8356 vectype_in, 1);
8357 else
8358 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8359 vectype_in, NULL);
8362 return true;
8365 /* STMT_INFO is a dot-product reduction whose multiplication operands
8366 have different signs. Emit a sequence to emulate the operation
8367 using a series of signed DOT_PROD_EXPRs and return the last
8368 statement generated. VEC_DEST is the result of the vector operation
8369 and VOP lists its inputs. */
8371 static gassign *
8372 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8373 gimple_stmt_iterator *gsi, tree vec_dest,
8374 tree vop[3])
8376 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8377 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8378 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8379 gimple *new_stmt;
8381 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8382 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8383 std::swap (vop[0], vop[1]);
8385 /* Convert all inputs to signed types. */
8386 for (int i = 0; i < 3; ++i)
8387 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8389 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8390 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8391 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8392 vop[i] = tmp;
8395 /* In the comments below we assume 8-bit inputs for simplicity,
8396 but the approach works for any full integer type. */
8398 /* Create a vector of -128. */
8399 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8400 tree min_narrow = build_vector_from_val (narrow_vectype,
8401 min_narrow_elttype);
8403 /* Create a vector of 64. */
8404 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8405 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8406 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8408 /* Emit: SUB_RES = VOP[0] - 128. */
8409 tree sub_res = make_ssa_name (narrow_vectype);
8410 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8411 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8413 /* Emit:
8415 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8416 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8417 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8419 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8420 Doing the two 64 * y steps first allows more time to compute SUB_RES. */
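/* A quick numeric check of the identity above with hypothetical 8-bit
   values that are not taken from the surrounding code: for x = 200
   (unsigned) and y = -3 (signed),
     x * y = -600,
     (x - 128) * y = 72 * -3 = -216,
     64 * y + 64 * y = -192 + -192 = -384,
   and -216 + -384 = -600. */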
8421 tree stage1 = make_ssa_name (wide_vectype);
8422 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8423 vop[1], half_narrow, vop[2]);
8424 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8426 tree stage2 = make_ssa_name (wide_vectype);
8427 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8428 vop[1], half_narrow, stage1);
8429 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8431 tree stage3 = make_ssa_name (wide_vectype);
8432 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8433 sub_res, vop[1], stage2);
8434 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8436 /* Convert STAGE3 to the reduction type. */
8437 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8440 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8441 value. */
8443 bool
8444 vect_transform_reduction (loop_vec_info loop_vinfo,
8445 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8446 gimple **vec_stmt, slp_tree slp_node)
8448 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8449 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8450 int i;
8451 int ncopies;
8452 int vec_num;
8454 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8455 gcc_assert (reduc_info->is_reduc_info);
8457 if (nested_in_vect_loop_p (loop, stmt_info))
8459 loop = loop->inner;
8460 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8463 gimple_match_op op;
8464 if (!gimple_extract_op (stmt_info->stmt, &op))
8465 gcc_unreachable ();
8467 /* All uses but the last are expected to be defined in the loop.
8468 The last use is the reduction variable. In case of nested cycle this
8469 assumption is not true: we use reduc_index to record the index of the
8470 reduction variable. */
8471 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8472 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8473 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8474 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8476 if (slp_node)
8478 ncopies = 1;
8479 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8481 else
8483 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8484 vec_num = 1;
8487 code_helper code = canonicalize_code (op.code, op.type);
8488 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8490 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8491 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8492 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8494 /* Transform. */
8495 tree new_temp = NULL_TREE;
8496 auto_vec<tree> vec_oprnds0;
8497 auto_vec<tree> vec_oprnds1;
8498 auto_vec<tree> vec_oprnds2;
8499 tree def0;
8501 if (dump_enabled_p ())
8502 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8504 /* FORNOW: Multiple types are not supported for condition. */
8505 if (code == COND_EXPR)
8506 gcc_assert (ncopies == 1);
8508 /* A binary COND_OP reduction must have the same definition and else
8509 value. */
8510 bool cond_fn_p = code.is_internal_fn ()
8511 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8512 if (cond_fn_p)
8514 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8515 || code == IFN_COND_MUL || code == IFN_COND_AND
8516 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8517 gcc_assert (op.num_ops == 4
8518 && (op.ops[reduc_index]
8519 == op.ops[internal_fn_else_index ((internal_fn) code)]));
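/* For illustration (a hypothetical GIMPLE form, not taken from a testcase),
   a masked accumulation r += x is represented as
     r_1 = .COND_ADD (mask, r_0, x, r_0);
   i.e. the reduction operand and the else value are the same SSA name,
   which is what the assert above verifies. */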
8522 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8524 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8525 if (reduction_type == FOLD_LEFT_REDUCTION)
8527 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8528 gcc_assert (code.is_tree_code () || cond_fn_p);
8529 return vectorize_fold_left_reduction
8530 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8531 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8532 reduc_index, masks, lens);
8535 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8536 gcc_assert (single_defuse_cycle
8537 || code == DOT_PROD_EXPR
8538 || code == WIDEN_SUM_EXPR
8539 || code == SAD_EXPR);
8541 /* Create the destination vector */
8542 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8543 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8545 /* Get NCOPIES vector definitions for all operands except the reduction
8546 definition. */
8547 if (!cond_fn_p)
8549 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8550 single_defuse_cycle && reduc_index == 0
8551 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8552 single_defuse_cycle && reduc_index == 1
8553 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8554 op.num_ops == 3
8555 && !(single_defuse_cycle && reduc_index == 2)
8556 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8558 else
8560 /* For a conditional operation pass the truth type as mask
8561 vectype. */
8562 gcc_assert (single_defuse_cycle
8563 && (reduc_index == 1 || reduc_index == 2));
8564 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8565 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8566 reduc_index == 1 ? NULL_TREE : op.ops[1],
8567 NULL_TREE, &vec_oprnds1,
8568 reduc_index == 2 ? NULL_TREE : op.ops[2],
8569 NULL_TREE, &vec_oprnds2);
8572 /* For single def-use cycles get one copy of the vectorized reduction
8573 definition. */
8574 if (single_defuse_cycle)
8576 gcc_assert (!slp_node);
8577 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8578 op.ops[reduc_index],
8579 reduc_index == 0 ? &vec_oprnds0
8580 : (reduc_index == 1 ? &vec_oprnds1
8581 : &vec_oprnds2));
8584 bool emulated_mixed_dot_prod
8585 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8586 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8588 gimple *new_stmt;
8589 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8590 if (masked_loop_p && !mask_by_cond_expr)
8592 /* No conditional ifns have been defined for dot-product yet. */
8593 gcc_assert (code != DOT_PROD_EXPR);
8595 /* Make sure that the reduction accumulator is vop[0]. */
8596 if (reduc_index == 1)
8598 gcc_assert (commutative_binary_op_p (code, op.type));
8599 std::swap (vop[0], vop[1]);
8601 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8602 vec_num * ncopies, vectype_in, i);
8603 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8604 vop[0], vop[1], vop[0]);
8605 new_temp = make_ssa_name (vec_dest, call);
8606 gimple_call_set_lhs (call, new_temp);
8607 gimple_call_set_nothrow (call, true);
8608 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8609 new_stmt = call;
8611 else
8613 if (op.num_ops >= 3)
8614 vop[2] = vec_oprnds2[i];
8616 if (masked_loop_p && mask_by_cond_expr)
8618 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8619 vec_num * ncopies, vectype_in, i);
8620 build_vect_cond_expr (code, vop, mask, gsi);
8623 if (emulated_mixed_dot_prod)
8624 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8625 vec_dest, vop);
8627 else if (code.is_internal_fn () && !cond_fn_p)
8628 new_stmt = gimple_build_call_internal (internal_fn (code),
8629 op.num_ops,
8630 vop[0], vop[1], vop[2]);
8631 else if (code.is_internal_fn () && cond_fn_p)
8632 new_stmt = gimple_build_call_internal (internal_fn (code),
8633 op.num_ops,
8634 vop[0], vop[1], vop[2],
8635 vop[1]);
8636 else
8637 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8638 vop[0], vop[1], vop[2]);
8639 new_temp = make_ssa_name (vec_dest, new_stmt);
8640 gimple_set_lhs (new_stmt, new_temp);
8641 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8644 if (slp_node)
8645 slp_node->push_vec_def (new_stmt);
8646 else if (single_defuse_cycle
8647 && i < ncopies - 1)
8649 if (reduc_index == 0)
8650 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8651 else if (reduc_index == 1)
8652 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8653 else if (reduc_index == 2)
8654 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8656 else
8657 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8660 if (!slp_node)
8661 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8663 return true;
8666 /* Transform phase of a cycle PHI. */
8668 bool
8669 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8670 stmt_vec_info stmt_info, gimple **vec_stmt,
8671 slp_tree slp_node, slp_instance slp_node_instance)
8673 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8674 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8675 int i;
8676 int ncopies;
8677 int j;
8678 bool nested_cycle = false;
8679 int vec_num;
8681 if (nested_in_vect_loop_p (loop, stmt_info))
8683 loop = loop->inner;
8684 nested_cycle = true;
8687 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8688 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8689 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8690 gcc_assert (reduc_info->is_reduc_info);
8692 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8693 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8694 /* Leave the scalar phi in place. */
8695 return true;
8697 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8698 /* For a nested cycle we do not fill the above. */
8699 if (!vectype_in)
8700 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8701 gcc_assert (vectype_in);
8703 if (slp_node)
8705 /* The size vect_schedule_slp_instance computes is off for us. */
8706 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8707 * SLP_TREE_LANES (slp_node), vectype_in);
8708 ncopies = 1;
8710 else
8712 vec_num = 1;
8713 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8716 /* Check whether we should use a single PHI node and accumulate
8717 vectors to one before the backedge. */
8718 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8719 ncopies = 1;
8721 /* Create the destination vector */
8722 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8723 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8724 vectype_out);
8726 /* Get the loop-entry arguments. */
8727 tree vec_initial_def = NULL_TREE;
8728 auto_vec<tree> vec_initial_defs;
8729 if (slp_node)
8731 vec_initial_defs.reserve (vec_num);
8732 if (nested_cycle)
8734 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8735 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8736 &vec_initial_defs);
8738 else
8740 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8741 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8742 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8744 unsigned int num_phis = stmts.length ();
8745 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8746 num_phis = 1;
8747 initial_values.reserve (num_phis);
8748 for (unsigned int i = 0; i < num_phis; ++i)
8750 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8751 initial_values.quick_push (vect_phi_initial_value (this_phi));
8753 if (vec_num == 1)
8754 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8755 if (!initial_values.is_empty ())
8757 tree initial_value
8758 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8759 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8760 tree neutral_op
8761 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8762 code, initial_value);
8763 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8764 &vec_initial_defs, vec_num,
8765 stmts.length (), neutral_op);
8769 else
8771 /* Get at the scalar def before the loop, that defines the initial
8772 value of the reduction variable. */
8773 tree initial_def = vect_phi_initial_value (phi);
8774 reduc_info->reduc_initial_values.safe_push (initial_def);
8775 /* Optimize: if for REDUC_MAX the initial_def is smaller than the base
8776 and we can't use zero for induc_val, use initial_def. Similarly
8777 for REDUC_MIN when initial_def is larger than the base. */
8778 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8780 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8781 if (TREE_CODE (initial_def) == INTEGER_CST
8782 && !integer_zerop (induc_val)
8783 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8784 && tree_int_cst_lt (initial_def, induc_val))
8785 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8786 && tree_int_cst_lt (induc_val, initial_def))))
8788 induc_val = initial_def;
8789 /* Communicate to the epilogue generation that we used the
8790 initial_def. */
8791 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8793 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8795 else if (nested_cycle)
8797 /* Do not use an adjustment def as that case is not supported
8798 correctly if ncopies is not one. */
8799 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8800 ncopies, initial_def,
8801 &vec_initial_defs);
8803 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8804 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8805 /* Fill the initial vector with the initial scalar value. */
8806 vec_initial_def
8807 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8808 initial_def, initial_def);
8809 else
8811 if (ncopies == 1)
8812 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8813 if (!reduc_info->reduc_initial_values.is_empty ())
8815 initial_def = reduc_info->reduc_initial_values[0];
8816 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8817 tree neutral_op
8818 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8819 code, initial_def);
8820 gcc_assert (neutral_op);
8821 /* Try to simplify the vector initialization by applying an
8822 adjustment after the reduction has been performed. */
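/* For example (with hypothetical values): a sum reduction whose scalar
   initial value is 10 can start the vector accumulator at the neutral
   value {0, 0, ...} and add the 10 back as an epilogue adjustment after
   the final reduction. */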
8823 if (!reduc_info->reused_accumulator
8824 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8825 && !operand_equal_p (neutral_op, initial_def))
8827 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8828 = initial_def;
8829 initial_def = neutral_op;
8831 vec_initial_def
8832 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8833 initial_def, neutral_op);
8838 if (vec_initial_def)
8840 vec_initial_defs.create (ncopies);
8841 for (i = 0; i < ncopies; ++i)
8842 vec_initial_defs.quick_push (vec_initial_def);
8845 if (auto *accumulator = reduc_info->reused_accumulator)
8847 tree def = accumulator->reduc_input;
8848 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8850 unsigned int nreduc;
8851 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8852 (TREE_TYPE (def)),
8853 TYPE_VECTOR_SUBPARTS (vectype_out),
8854 &nreduc);
8855 gcc_assert (res);
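/* As a hypothetical example: if the main loop accumulated in a V8SI
   vector and this epilogue uses V4SI, nreduc is 2 and the code below
   first reduces the V8SI accumulator to a single V4SI vector. */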
8856 gimple_seq stmts = NULL;
8857 /* Reduce the single vector to a smaller one. */
8858 if (nreduc != 1)
8860 /* Perform the reduction in the appropriate type. */
8861 tree rvectype = vectype_out;
8862 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8863 TREE_TYPE (TREE_TYPE (def))))
8864 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8865 TYPE_VECTOR_SUBPARTS
8866 (vectype_out));
8867 def = vect_create_partial_epilog (def, rvectype,
8868 STMT_VINFO_REDUC_CODE
8869 (reduc_info),
8870 &stmts);
8872 /* The epilogue loop might use a different vector mode, like
8873 VNx2DI vs. V2DI. */
8874 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8876 tree reduc_type = build_vector_type_for_mode
8877 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8878 def = gimple_convert (&stmts, reduc_type, def);
8880 /* Adjust the input so we pick up the partially reduced value
8881 for the skip edge in vect_create_epilog_for_reduction. */
8882 accumulator->reduc_input = def;
8883 /* And the reduction could be carried out using a different sign. */
8884 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8885 def = gimple_convert (&stmts, vectype_out, def);
8886 if (loop_vinfo->main_loop_edge)
8888 /* While we'd like to insert on the edge, doing so would split
8889 blocks and disturb bookkeeping, and we will also eventually
8890 need this on the skip edge. Rely on sinking to
8891 fix up the optimal placement and insert in the pred. */
8892 gimple_stmt_iterator gsi
8893 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8894 /* Insert before a cond that eventually skips the
8895 epilogue. */
8896 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8897 gsi_prev (&gsi);
8898 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8900 else
8901 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8902 stmts);
8904 if (loop_vinfo->main_loop_edge)
8905 vec_initial_defs[0]
8906 = vect_get_main_loop_result (loop_vinfo, def,
8907 vec_initial_defs[0]);
8908 else
8909 vec_initial_defs.safe_push (def);
8912 /* Generate the reduction PHIs upfront. */
8913 for (i = 0; i < vec_num; i++)
8915 tree vec_init_def = vec_initial_defs[i];
8916 for (j = 0; j < ncopies; j++)
8918 /* Create the reduction-phi that defines the reduction
8919 operand. */
8920 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8922 /* Set the loop-entry arg of the reduction-phi. */
8923 if (j != 0 && nested_cycle)
8924 vec_init_def = vec_initial_defs[j];
8925 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8926 UNKNOWN_LOCATION);
8928 /* The loop-latch arg is set in epilogue processing. */
8930 if (slp_node)
8931 slp_node->push_vec_def (new_phi);
8932 else
8934 if (j == 0)
8935 *vec_stmt = new_phi;
8936 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8941 return true;
8944 /* Vectorizes LC PHIs. */
8946 bool
8947 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8948 stmt_vec_info stmt_info, gimple **vec_stmt,
8949 slp_tree slp_node)
8951 if (!loop_vinfo
8952 || !is_a <gphi *> (stmt_info->stmt)
8953 || gimple_phi_num_args (stmt_info->stmt) != 1)
8954 return false;
8956 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8957 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8958 return false;
8960 if (!vec_stmt) /* transformation not required. */
8962 /* Deal with copies from externs or constants that are disguised as
8963 loop-closed PHI nodes (PR97886). */
8964 if (slp_node
8965 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8966 SLP_TREE_VECTYPE (slp_node)))
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8970 "incompatible vector types for invariants\n");
8971 return false;
8973 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8974 return true;
8977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8978 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8979 basic_block bb = gimple_bb (stmt_info->stmt);
8980 edge e = single_pred_edge (bb);
8981 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8982 auto_vec<tree> vec_oprnds;
8983 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8984 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8985 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8986 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8988 /* Create the vectorized LC PHI node. */
8989 gphi *new_phi = create_phi_node (vec_dest, bb);
8990 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8991 if (slp_node)
8992 slp_node->push_vec_def (new_phi);
8993 else
8994 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8996 if (!slp_node)
8997 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8999 return true;
9002 /* Vectorizes PHIs. */
9004 bool
9005 vectorizable_phi (vec_info *,
9006 stmt_vec_info stmt_info, gimple **vec_stmt,
9007 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9009 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9010 return false;
9012 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9013 return false;
9015 tree vectype = SLP_TREE_VECTYPE (slp_node);
9017 if (!vec_stmt) /* transformation not required. */
9019 slp_tree child;
9020 unsigned i;
9021 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9022 if (!child)
9024 if (dump_enabled_p ())
9025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9026 "PHI node with unvectorized backedge def\n");
9027 return false;
9029 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9031 if (dump_enabled_p ())
9032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9033 "incompatible vector types for invariants\n");
9034 return false;
9036 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9037 && !useless_type_conversion_p (vectype,
9038 SLP_TREE_VECTYPE (child)))
9040 /* With bools we can have mask and non-mask precision vectors
9041 or different non-mask precisions. While pattern recog is
9042 supposed to guarantee consistency here, bugs in it can cause
9043 mismatches (PR103489 and PR103800 for example).
9044 Deal with them here instead of ICEing later. */
9045 if (dump_enabled_p ())
9046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9047 "incompatible vector type setup from "
9048 "bool pattern detection\n");
9049 return false;
9052 /* For single-argument PHIs assume coalescing which means zero cost
9053 for the scalar and the vector PHIs. This avoids artificially
9054 favoring the vector path (but may pessimize it in some cases). */
9055 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9056 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9057 vector_stmt, stmt_info, vectype, 0, vect_body);
9058 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9059 return true;
9062 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9063 basic_block bb = gimple_bb (stmt_info->stmt);
9064 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9065 auto_vec<gphi *> new_phis;
9066 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9068 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9070 /* Skip not yet vectorized defs. */
9071 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9072 && SLP_TREE_VEC_DEFS (child).is_empty ())
9073 continue;
9075 auto_vec<tree> vec_oprnds;
9076 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9077 if (!new_phis.exists ())
9079 new_phis.create (vec_oprnds.length ());
9080 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9082 /* Create the vectorized PHI node. */
9083 new_phis.quick_push (create_phi_node (vec_dest, bb));
9084 slp_node->push_vec_def (new_phis[j]);
9087 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9088 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9089 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9091 /* We should have at least one already vectorized child. */
9092 gcc_assert (new_phis.exists ());
9094 return true;
9097 /* Vectorizes first order recurrences. An overview of the transformation
9098 is described below. Suppose we have the following loop.
9100 int t = 0;
9101 for (int i = 0; i < n; ++i)
9103 b[i] = a[i] - t;
9104 t = a[i];
9107 There is a first-order recurrence on 'a'. For this loop, the scalar IR
9108 looks (simplified) like:
9110 scalar.preheader:
9111 init = 0;
9113 scalar.body:
9114 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9115 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9116 _1 = a[i]
9117 b[i] = _1 - _2
9118 if (i < n) goto scalar.body
9120 In this example, _2 is a recurrence because its value depends on the
9121 previous iteration. We vectorize this as (VF = 4)
9123 vector.preheader:
9124 vect_init = vect_cst(..., ..., ..., 0)
9126 vector.body
9127 i = PHI <0(vector.preheader), i+4(vector.body)>
9128 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9129 vect_2 = a[i, i+1, i+2, i+3];
9130 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9131 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9132 if (..) goto vector.body
9134 In this function, vectorizable_recurr, we code generate both the
9135 vector PHI node and the permute since those together compute the
9136 vectorized value of the scalar PHI. We do not yet have the
9137 backedge value to fill in there nor into the vec_perm. Those
9138 are filled in maybe_set_vectorized_backedge_value and
9139 vect_schedule_scc.
9141 TODO: Since the scalar loop does not have a use of the recurrence
9142 outside of the loop, the natural way to implement peeling via
9143 vectorizing the live value doesn't work. For now peeling of loops
9144 with a recurrence is not implemented. For SLP the supported cases
9145 are restricted to those requiring a single vector recurrence PHI. */
9147 bool
9148 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9149 gimple **vec_stmt, slp_tree slp_node,
9150 stmt_vector_for_cost *cost_vec)
9152 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9153 return false;
9155 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9157 /* So far we only support first-order recurrence auto-vectorization. */
9158 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9159 return false;
9161 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9162 unsigned ncopies;
9163 if (slp_node)
9164 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9165 else
9166 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9167 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9168 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9169 /* We need to be able to make progress with a single vector. */
9170 if (maybe_gt (dist * 2, nunits))
9172 if (dump_enabled_p ())
9173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9174 "first order recurrence exceeds half of "
9175 "a vector\n");
9176 return false;
9179 /* First-order recurrence autovectorization needs to handle permutation
9180 with indices = [nunits-1, nunits, nunits+1, ...]. */
9181 vec_perm_builder sel (nunits, 1, 3);
9182 for (int i = 0; i < 3; ++i)
9183 sel.quick_push (nunits - dist + i);
9184 vec_perm_indices indices (sel, 2, nunits);
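/* For instance, with a hypothetical four-lane vector and dist == 2 (two SLP
   lanes), the series above expands to the indices { 2, 3, 4, 5 }: the last
   two lanes of the previous vector followed by the first two lanes of the
   current one, matching the { 3, 4, 5, 6 } example for dist == 1 in the
   function comment. */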
9186 if (!vec_stmt) /* transformation not required. */
9188 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9189 indices))
9190 return false;
9192 if (slp_node)
9194 /* We eventually need to set a vector type on invariant
9195 arguments. */
9196 unsigned j;
9197 slp_tree child;
9198 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9199 if (!vect_maybe_update_slp_op_vectype
9200 (child, SLP_TREE_VECTYPE (slp_node)))
9202 if (dump_enabled_p ())
9203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9204 "incompatible vector types for "
9205 "invariants\n");
9206 return false;
9209 /* The recurrence costs the initialization vector and one permute
9210 for each copy. */
9211 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9212 stmt_info, 0, vect_prologue);
9213 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9214 stmt_info, 0, vect_body);
9215 if (dump_enabled_p ())
9216 dump_printf_loc (MSG_NOTE, vect_location,
9217 "vectorizable_recurr: inside_cost = %d, "
9218 "prologue_cost = %d .\n", inside_cost,
9219 prologue_cost);
9221 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9222 return true;
9225 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9226 basic_block bb = gimple_bb (phi);
9227 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9228 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9230 gimple_seq stmts = NULL;
9231 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9232 gsi_insert_seq_on_edge_immediate (pe, stmts);
9234 tree vec_init = build_vector_from_val (vectype, preheader);
9235 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9237 /* Create the vectorized first-order PHI node. */
9238 tree vec_dest = vect_get_new_vect_var (vectype,
9239 vect_simple_var, "vec_recur_");
9240 gphi *new_phi = create_phi_node (vec_dest, bb);
9241 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9243 /* Insert shuffles for the first-order recurrence autovectorization.
9244 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9245 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9247 /* Insert the required permute after the latch definition. The
9248 second and later operands are tentative and will be updated when we have
9249 vectorized the latch definition. */
9250 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9251 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9252 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9253 gsi_next (&gsi2);
9255 for (unsigned i = 0; i < ncopies; ++i)
9257 vec_dest = make_ssa_name (vectype);
9258 gassign *vperm
9259 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9260 i == 0 ? gimple_phi_result (new_phi) : NULL,
9261 NULL, perm);
9262 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9264 if (slp_node)
9265 slp_node->push_vec_def (vperm);
9266 else
9267 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9270 if (!slp_node)
9271 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9272 return true;
9275 /* Return true if VECTYPE represents a vector that requires lowering
9276 by the vector lowering pass. */
9278 bool
9279 vect_emulated_vector_p (tree vectype)
9281 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9282 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9283 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9286 /* Return true if we can emulate CODE on an integer mode representation
9287 of a vector. */
9289 bool
9290 vect_can_vectorize_without_simd_p (tree_code code)
9292 switch (code)
9294 case PLUS_EXPR:
9295 case MINUS_EXPR:
9296 case NEGATE_EXPR:
9297 case BIT_AND_EXPR:
9298 case BIT_IOR_EXPR:
9299 case BIT_XOR_EXPR:
9300 case BIT_NOT_EXPR:
9301 return true;
9303 default:
9304 return false;
9308 /* Likewise, but taking a code_helper. */
9310 bool
9311 vect_can_vectorize_without_simd_p (code_helper code)
9313 return (code.is_tree_code ()
9314 && vect_can_vectorize_without_simd_p (tree_code (code)));
9317 /* Create vector init for vectorized iv. */
9318 static tree
9319 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9320 tree step_expr, poly_uint64 nunits,
9321 tree vectype,
9322 enum vect_induction_op_type induction_type)
9324 unsigned HOST_WIDE_INT const_nunits;
9325 tree vec_shift, vec_init, new_name;
9326 unsigned i;
9327 tree itype = TREE_TYPE (vectype);
9329 /* iv_loop is the loop to be vectorized. Create the nonlinear analogue of
9330 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
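/* As an illustration with four lanes (a hypothetical width, not implied by
   the code), the cases below produce:
     shr/shl:  [X >> 0*S, X >> 1*S, X >> 2*S, X >> 3*S]   (resp. <<)
     neg:      [X, -X, X, -X]
     mul:      [X, X*S, X*S^2, X*S^3].  */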
9331 new_name = gimple_convert (stmts, itype, init_expr);
9332 switch (induction_type)
9334 case vect_step_op_shr:
9335 case vect_step_op_shl:
9336 /* Build the Initial value from shift_expr. */
9337 vec_init = gimple_build_vector_from_val (stmts,
9338 vectype,
9339 new_name);
9340 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9341 build_zero_cst (itype), step_expr);
9342 vec_init = gimple_build (stmts,
9343 (induction_type == vect_step_op_shr
9344 ? RSHIFT_EXPR : LSHIFT_EXPR),
9345 vectype, vec_init, vec_shift);
9346 break;
9348 case vect_step_op_neg:
9350 vec_init = gimple_build_vector_from_val (stmts,
9351 vectype,
9352 new_name);
9353 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9354 vectype, vec_init);
9355 /* The encoding has 2 interleaved stepped patterns. */
9356 vec_perm_builder sel (nunits, 2, 3);
9357 sel.quick_grow (6);
9358 for (i = 0; i < 3; i++)
9360 sel[2 * i] = i;
9361 sel[2 * i + 1] = i + nunits;
9363 vec_perm_indices indices (sel, 2, nunits);
9364 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9365 fail when vec_init is a const vector. In that situation vec_perm is not
9366 really needed. */
9367 tree perm_mask_even
9368 = vect_gen_perm_mask_any (vectype, indices);
9369 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9370 vectype,
9371 vec_init, vec_neg,
9372 perm_mask_even);
9374 break;
9376 case vect_step_op_mul:
9378 /* Use an unsigned mult to avoid undefined behavior from signed integer overflow. */
9379 gcc_assert (nunits.is_constant (&const_nunits));
9380 tree utype = unsigned_type_for (itype);
9381 tree uvectype = build_vector_type (utype,
9382 TYPE_VECTOR_SUBPARTS (vectype));
9383 new_name = gimple_convert (stmts, utype, new_name);
9384 vec_init = gimple_build_vector_from_val (stmts,
9385 uvectype,
9386 new_name);
9387 tree_vector_builder elts (uvectype, const_nunits, 1);
9388 tree elt_step = build_one_cst (utype);
9390 elts.quick_push (elt_step);
9391 for (i = 1; i < const_nunits; i++)
9393 /* Create: elt_step = elt_step * step_expr. */
9394 elt_step = gimple_build (stmts, MULT_EXPR,
9395 utype, elt_step, step_expr);
9396 elts.quick_push (elt_step);
9398 /* Create a vector from [new_name_0, new_name_1, ...,
9399 new_name_nunits-1]. */
9400 tree vec_mul = gimple_build_vector (stmts, &elts);
9401 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9402 vec_init, vec_mul);
9403 vec_init = gimple_convert (stmts, vectype, vec_init);
9405 break;
9407 default:
9408 gcc_unreachable ();
9411 return vec_init;
9414 /* Peel init_expr by skip_niters for induction_type. */
9415 tree
9416 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9417 tree skip_niters, tree step_expr,
9418 enum vect_induction_op_type induction_type)
9420 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9421 tree type = TREE_TYPE (init_expr);
9422 unsigned prec = TYPE_PRECISION (type);
9423 switch (induction_type)
9425 case vect_step_op_neg:
9426 if (TREE_INT_CST_LOW (skip_niters) % 2)
9427 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9428 /* else no change. */
9429 break;
9431 case vect_step_op_shr:
9432 case vect_step_op_shl:
9433 skip_niters = gimple_convert (stmts, type, skip_niters);
9434 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9435 /* When the shift amount >= precision, we need to avoid UD.
9436 In the original loop there is no UD, and according to the semantics,
9437 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
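/* E.g. (hypothetical values) for an unsigned 8-bit shr induction with
   step 3 and skip_niters 4 the accumulated shift is 12 >= 8, so the
   peeled initial value is simply 0. */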
9438 if (!tree_fits_uhwi_p (step_expr)
9439 || tree_to_uhwi (step_expr) >= prec)
9441 if (induction_type == vect_step_op_shl
9442 || TYPE_UNSIGNED (type))
9443 init_expr = build_zero_cst (type);
9444 else
9445 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9446 init_expr,
9447 wide_int_to_tree (type, prec - 1));
9449 else
9450 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9451 ? RSHIFT_EXPR : LSHIFT_EXPR),
9452 type, init_expr, step_expr);
9453 break;
9455 case vect_step_op_mul:
9457 tree utype = unsigned_type_for (type);
9458 init_expr = gimple_convert (stmts, utype, init_expr);
9459 wide_int skipn = wi::to_wide (skip_niters);
9460 wide_int begin = wi::to_wide (step_expr);
9461 auto_mpz base, exp, mod, res;
9462 wi::to_mpz (begin, base, TYPE_SIGN (type));
9463 wi::to_mpz (skipn, exp, UNSIGNED);
9464 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9465 mpz_powm (res, base, exp, mod);
9466 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9467 tree mult_expr = wide_int_to_tree (utype, begin);
9468 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9469 init_expr, mult_expr);
9470 init_expr = gimple_convert (stmts, type, init_expr);
9472 break;
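/* To illustrate the modular exponentiation above with hypothetical
   values: for an 8-bit mult induction with init 3, step 5 and
   skip_niters 2 it computes 5^2 mod 2^8 = 25, so the peeled initial
   value becomes 3 * 25 = 75. */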
9474 default:
9475 gcc_unreachable ();
9478 return init_expr;
9481 /* Create vector step for vectorized iv. */
9482 static tree
9483 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9484 poly_uint64 vf,
9485 enum vect_induction_op_type induction_type)
9487 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9488 tree new_name = NULL;
9489 /* Step should be pow (step, vf) for mult induction. */
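/* E.g. a hypothetical scalar step of 3 with VF 4 gives a vector step of
   3^4 = 81 (truncated to the precision of the step type). */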
9490 if (induction_type == vect_step_op_mul)
9492 gcc_assert (vf.is_constant ());
9493 wide_int begin = wi::to_wide (step_expr);
9495 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9496 begin = wi::mul (begin, wi::to_wide (step_expr));
9498 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9500 else if (induction_type == vect_step_op_neg)
9501 /* Do nothing. */
9503 else
9504 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9505 expr, step_expr);
9506 return new_name;
9509 static tree
9510 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9511 stmt_vec_info stmt_info,
9512 tree new_name, tree vectype,
9513 enum vect_induction_op_type induction_type)
9515 /* No step is needed for neg induction. */
9516 if (induction_type == vect_step_op_neg)
9517 return NULL;
9519 tree t = unshare_expr (new_name);
9520 gcc_assert (CONSTANT_CLASS_P (new_name)
9521 || TREE_CODE (new_name) == SSA_NAME);
9522 tree new_vec = build_vector_from_val (vectype, t);
9523 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9524 new_vec, vectype, NULL);
9525 return vec_step;
9528 /* Update the vectorized iv with vec_step; induc_def is the init. */
9529 static tree
9530 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9531 tree induc_def, tree vec_step,
9532 enum vect_induction_op_type induction_type)
9534 tree vec_def = induc_def;
9535 switch (induction_type)
9537 case vect_step_op_mul:
9539 /* Use an unsigned mult to avoid undefined behavior from signed integer overflow. */
9540 tree uvectype
9541 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9542 TYPE_VECTOR_SUBPARTS (vectype));
9543 vec_def = gimple_convert (stmts, uvectype, vec_def);
9544 vec_step = gimple_convert (stmts, uvectype, vec_step);
9545 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9546 vec_def, vec_step);
9547 vec_def = gimple_convert (stmts, vectype, vec_def);
9549 break;
9551 case vect_step_op_shr:
9552 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9553 vec_def, vec_step);
9554 break;
9556 case vect_step_op_shl:
9557 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9558 vec_def, vec_step);
9559 break;
9560 case vect_step_op_neg:
9561 vec_def = induc_def;
9562 /* Do nothing. */
9563 break;
9564 default:
9565 gcc_unreachable ();
9568 return vec_def;
9572 /* Function vectorizable_induction
9574 Check if STMT_INFO performs a nonlinear induction computation that can be
9575 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9576 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9577 basic block.
9578 Return true if STMT_INFO is vectorizable in this way. */
9580 static bool
9581 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9582 stmt_vec_info stmt_info,
9583 gimple **vec_stmt, slp_tree slp_node,
9584 stmt_vector_for_cost *cost_vec)
9586 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9587 unsigned ncopies;
9588 bool nested_in_vect_loop = false;
9589 class loop *iv_loop;
9590 tree vec_def;
9591 edge pe = loop_preheader_edge (loop);
9592 basic_block new_bb;
9593 tree vec_init, vec_step;
9594 tree new_name;
9595 gimple *new_stmt;
9596 gphi *induction_phi;
9597 tree induc_def, vec_dest;
9598 tree init_expr, step_expr;
9599 tree niters_skip;
9600 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9601 unsigned i;
9602 gimple_stmt_iterator si;
9604 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9606 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9607 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9608 enum vect_induction_op_type induction_type
9609 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9611 gcc_assert (induction_type > vect_step_op_add);
9613 if (slp_node)
9614 ncopies = 1;
9615 else
9616 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9617 gcc_assert (ncopies >= 1);
9619 /* FORNOW. Only handle nonlinear induction in the same loop. */
9620 if (nested_in_vect_loop_p (loop, stmt_info))
9622 if (dump_enabled_p ())
9623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9624 "nonlinear induction in nested loop.\n");
9625 return false;
9628 iv_loop = loop;
9629 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9631 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9632 update for each iv and a permutation to generate the wanted vector iv. */
9633 if (slp_node)
9635 if (dump_enabled_p ())
9636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9637 "SLP induction not supported for nonlinear"
9638 " induction.\n");
9639 return false;
9642 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9644 if (dump_enabled_p ())
9645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9646 "floating point nonlinear induction vectorization"
9647 " not supported.\n");
9648 return false;
9651 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9652 init_expr = vect_phi_initial_value (phi);
9653 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9654 && TREE_CODE (step_expr) == INTEGER_CST);
9655 /* step_expr should have the same type as init_expr:
9656 e.g. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9657 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9659 if (TREE_CODE (init_expr) == INTEGER_CST)
9660 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9661 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9663 /* INIT_EXPR could be a bit_field, bail out in that case. */
9664 if (dump_enabled_p ())
9665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9666 "nonlinear induction vectorization failed:"
9667 " component type of vectype is not a nop conversion"
9668 " from type of init_expr.\n");
9669 return false;
9672 switch (induction_type)
9674 case vect_step_op_neg:
9675 if (TREE_CODE (init_expr) != INTEGER_CST
9676 && TREE_CODE (init_expr) != REAL_CST)
9678 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9679 if (!directly_supported_p (NEGATE_EXPR, vectype))
9680 return false;
9682 /* The encoding has 2 interleaved stepped patterns. */
9683 vec_perm_builder sel (nunits, 2, 3);
9684 machine_mode mode = TYPE_MODE (vectype);
9685 sel.quick_grow (6);
9686 for (i = 0; i < 3; i++)
9688 sel[i * 2] = i;
9689 sel[i * 2 + 1] = i + nunits;
9691 vec_perm_indices indices (sel, 2, nunits);
9692 if (!can_vec_perm_const_p (mode, mode, indices))
9693 return false;
9695 break;
9697 case vect_step_op_mul:
9699 /* Check for backend support of MULT_EXPR. */
9700 if (!directly_supported_p (MULT_EXPR, vectype))
9701 return false;
9703 /* ??? How do we construct the vector step for a variable-length
9704 vector: [ 1, step, pow (step, 2), pow (step, 3), .. ]? */
9705 if (!vf.is_constant ())
9706 return false;
9708 break;
9710 case vect_step_op_shr:
9711 /* Check for backend support of RSHIFT_EXPR. */
9712 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9713 return false;
9715 /* Don't shift more than type precision to avoid UD. */
9716 if (!tree_fits_uhwi_p (step_expr)
9717 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9718 TYPE_PRECISION (TREE_TYPE (init_expr))))
9719 return false;
9720 break;
9722 case vect_step_op_shl:
9723 /* Check for backend support of LSHIFT_EXPR. */
9724 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9725 return false;
9727 /* Don't shift more than type precision to avoid UD. */
9728 if (!tree_fits_uhwi_p (step_expr)
9729 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9730 TYPE_PRECISION (TREE_TYPE (init_expr))))
9731 return false;
9733 break;
9735 default:
9736 gcc_unreachable ();
9739 if (!vec_stmt) /* transformation not required. */
9741 unsigned inside_cost = 0, prologue_cost = 0;
9742 /* loop cost for vec_loop. */
9744 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9745 stmt_info, 0, vect_body);
9747 /* Neg induction doesn't have any inside_cost. */
9749 if (induction_type == vect_step_op_neg)
9750 inside_cost = 0;
9752 /* prologue cost for vec_init and vec_step. */
9753 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9754 stmt_info, 0, vect_prologue);
9756 if (dump_enabled_p ())
9757 dump_printf_loc (MSG_NOTE, vect_location,
9758 "vect_model_induction_cost: inside_cost = %d, "
9759 "prologue_cost = %d. \n", inside_cost,
9760 prologue_cost);
9762 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9763 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9764 return true;
9767 /* Transform. */
9769 /* Compute a vector variable, initialized with the first VF values of
9770 the induction variable. E.g., for an iv with IV_PHI='X' and
9771 evolution S, for a vector of 4 units, we want to compute:
9772 [X, X + S, X + 2*S, X + 3*S]. */
9774 if (dump_enabled_p ())
9775 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9777 pe = loop_preheader_edge (iv_loop);
9778 /* Find the first insertion point in the BB. */
9779 basic_block bb = gimple_bb (phi);
9780 si = gsi_after_labels (bb);
9782 gimple_seq stmts = NULL;
9784 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9785 /* If we are using the loop mask to "peel" for alignment then we need
9786 to adjust the start value here. */
9787 if (niters_skip != NULL_TREE)
9788 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9789 step_expr, induction_type);
9791 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9792 step_expr, nunits, vectype,
9793 induction_type);
9794 if (stmts)
9796 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9797 gcc_assert (!new_bb);
9800 stmts = NULL;
9801 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9802 vf, induction_type);
9803 if (stmts)
9805 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9806 gcc_assert (!new_bb);
9809 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9810 new_name, vectype,
9811 induction_type);
9812 /* Create the following def-use cycle:
9813 loop prolog:
9814 vec_init = ...
9815 vec_step = ...
9816 loop:
9817 vec_iv = PHI <vec_init, vec_loop>
9819 STMT
9821 vec_loop = vec_iv OP vec_step; (OP is the nonlinear update) */
9823 /* Create the induction-phi that defines the induction-operand. */
9824 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9825 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9826 induc_def = PHI_RESULT (induction_phi);
9828 /* Create the iv update inside the loop. */
9829 stmts = NULL;
9830 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9831 induc_def, vec_step,
9832 induction_type);
9834 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9835 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9837 /* Set the arguments of the phi node: */
9838 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9839 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9840 UNKNOWN_LOCATION);
9842 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9843 *vec_stmt = induction_phi;
9845 /* In case the vectorization factor (VF) is bigger than the number
9846 of elements that we can fit in a vectype (nunits), we have to generate
9847 more than one vector stmt, i.e. we need to "unroll" the
9848 vector stmt by a factor VF/nunits. For more details see documentation
9849 in vectorizable_operation. */
9851 if (ncopies > 1)
9853 stmts = NULL;
9854 /* FORNOW. This restriction should be relaxed. */
9855 gcc_assert (!nested_in_vect_loop);
9857 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9858 nunits, induction_type);
9860 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9861 new_name, vectype,
9862 induction_type);
9863 vec_def = induc_def;
9864 for (i = 1; i < ncopies; i++)
9866 /* vec_i = vec_prev OP vec_step. */
9867 stmts = NULL;
9868 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9869 vec_def, vec_step,
9870 induction_type);
9871 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9872 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9873 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9877 if (dump_enabled_p ())
9878 dump_printf_loc (MSG_NOTE, vect_location,
9879 "transform induction: created def-use cycle: %G%G",
9880 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9882 return true;
9885 /* Function vectorizable_induction
9887 Check if STMT_INFO performs an induction computation that can be vectorized.
9888 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9889 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9890 Return true if STMT_INFO is vectorizable in this way. */
9892 bool
9893 vectorizable_induction (loop_vec_info loop_vinfo,
9894 stmt_vec_info stmt_info,
9895 gimple **vec_stmt, slp_tree slp_node,
9896 stmt_vector_for_cost *cost_vec)
9898 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9899 unsigned ncopies;
9900 bool nested_in_vect_loop = false;
9901 class loop *iv_loop;
9902 tree vec_def;
9903 edge pe = loop_preheader_edge (loop);
9904 basic_block new_bb;
9905 tree new_vec, vec_init, vec_step, t;
9906 tree new_name;
9907 gimple *new_stmt;
9908 gphi *induction_phi;
9909 tree induc_def, vec_dest;
9910 tree init_expr, step_expr;
9911 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9912 unsigned i;
9913 tree expr;
9914 gimple_stmt_iterator si;
9915 enum vect_induction_op_type induction_type
9916 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9918 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9919 if (!phi)
9920 return false;
9922 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9923 return false;
9925 /* Make sure it was recognized as induction computation. */
9926 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9927 return false;
9929 /* Handle nonlinear induction in a separate place. */
9930 if (induction_type != vect_step_op_add)
9931 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9932 vec_stmt, slp_node, cost_vec);
9934 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9935 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9937 if (slp_node)
9938 ncopies = 1;
9939 else
9940 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9941 gcc_assert (ncopies >= 1);
9943 /* FORNOW. These restrictions should be relaxed. */
9944 if (nested_in_vect_loop_p (loop, stmt_info))
9946 imm_use_iterator imm_iter;
9947 use_operand_p use_p;
9948 gimple *exit_phi;
9949 edge latch_e;
9950 tree loop_arg;
9952 if (ncopies > 1)
9954 if (dump_enabled_p ())
9955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9956 "multiple types in nested loop.\n");
9957 return false;
9960 exit_phi = NULL;
9961 latch_e = loop_latch_edge (loop->inner);
9962 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9963 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9965 gimple *use_stmt = USE_STMT (use_p);
9966 if (is_gimple_debug (use_stmt))
9967 continue;
9969 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9971 exit_phi = use_stmt;
9972 break;
9975 if (exit_phi)
9977 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9978 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9979 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9981 if (dump_enabled_p ())
9982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9983 "inner-loop induction only used outside "
9984 "of the outer vectorized loop.\n");
9985 return false;
9989 nested_in_vect_loop = true;
9990 iv_loop = loop->inner;
9992 else
9993 iv_loop = loop;
9994 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9996 if (slp_node && !nunits.is_constant ())
9998 /* The current SLP code creates the step value element-by-element. */
9999 if (dump_enabled_p ())
10000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10001 "SLP induction not supported for variable-length"
10002 " vectors.\n");
10003 return false;
10006 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10008 if (dump_enabled_p ())
10009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10010 "floating point induction vectorization disabled\n");
10011 return false;
10014 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10015 gcc_assert (step_expr != NULL_TREE);
10016 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10018 /* Check for backend support of PLUS/MINUS_EXPR. */
10019 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10020 || !directly_supported_p (MINUS_EXPR, step_vectype))
10021 return false;
10023 if (!vec_stmt) /* transformation not required. */
10025 unsigned inside_cost = 0, prologue_cost = 0;
10026 if (slp_node)
10028 /* We eventually need to set a vector type on invariant
10029 arguments. */
10030 unsigned j;
10031 slp_tree child;
10032 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10033 if (!vect_maybe_update_slp_op_vectype
10034 (child, SLP_TREE_VECTYPE (slp_node)))
10036 if (dump_enabled_p ())
10037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10038 "incompatible vector types for "
10039 "invariants\n");
10040 return false;
10042 /* loop cost for vec_loop. */
10043 inside_cost
10044 = record_stmt_cost (cost_vec,
10045 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10046 vector_stmt, stmt_info, 0, vect_body);
10047 /* prologue cost for vec_init (if not nested) and step. */
10048 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10049 scalar_to_vec,
10050 stmt_info, 0, vect_prologue);
10052 else /* if (!slp_node) */
10054 /* loop cost for vec_loop. */
10055 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10056 stmt_info, 0, vect_body);
10057 /* prologue cost for vec_init and vec_step. */
10058 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10059 stmt_info, 0, vect_prologue);
10061 if (dump_enabled_p ())
10062 dump_printf_loc (MSG_NOTE, vect_location,
10063 "vect_model_induction_cost: inside_cost = %d, "
10064 "prologue_cost = %d .\n", inside_cost,
10065 prologue_cost);
10067 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10068 DUMP_VECT_SCOPE ("vectorizable_induction");
10069 return true;
10072 /* Transform. */
10074 /* Compute a vector variable, initialized with the first VF values of
10075 the induction variable. E.g., for an iv with IV_PHI='X' and
10076 evolution S, for a vector of 4 units, we want to compute:
10077 [X, X + S, X + 2*S, X + 3*S]. */
10079 if (dump_enabled_p ())
10080 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10082 pe = loop_preheader_edge (iv_loop);
10083 /* Find the first insertion point in the BB. */
10084 basic_block bb = gimple_bb (phi);
10085 si = gsi_after_labels (bb);
10087 /* For SLP induction we have to generate several IVs as for example
10088 with group size 3 we need
10089 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10090 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10091 if (slp_node)
10093 /* Enforced above. */
10094 unsigned int const_nunits = nunits.to_constant ();
10096 /* The initial values are vectorized, but any lanes > group_size
10097 need adjustment. */
10098 slp_tree init_node
10099 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10101 /* Gather steps. Since we do not vectorize inductions as
10102 cycles we have to reconstruct the step from SCEV data. */
10103 unsigned group_size = SLP_TREE_LANES (slp_node);
10104 tree *steps = XALLOCAVEC (tree, group_size);
10105 tree *inits = XALLOCAVEC (tree, group_size);
10106 stmt_vec_info phi_info;
10107 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10109 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10110 if (!init_node)
10111 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10112 pe->dest_idx);
10115 /* Now generate the IVs. */
10116 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10117 gcc_assert ((const_nunits * nvects) % group_size == 0);
10118 unsigned nivs;
10119 if (nested_in_vect_loop)
10120 nivs = nvects;
10121 else
10123 /* Compute the number of distinct IVs we need. First reduce
10124 group_size if it is a multiple of const_nunits so we get
10125 one IV for a group_size of 4 but const_nunits 2. */
10126 unsigned group_sizep = group_size;
10127 if (group_sizep % const_nunits == 0)
10128 group_sizep = group_sizep / const_nunits;
10129 nivs = least_common_multiple (group_sizep,
10130 const_nunits) / const_nunits;
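/* E.g. for the group size 3, const_nunits 4 case above this gives
   group_sizep = 3 and nivs = lcm (3, 4) / 4 = 3 distinct IVs, while
   group_size 4 with const_nunits 2 reduces group_sizep to 2 and
   nivs = lcm (2, 2) / 2 = 1.  */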
10132 tree stept = TREE_TYPE (step_vectype);
10133 tree lupdate_mul = NULL_TREE;
10134 if (!nested_in_vect_loop)
10136 /* The number of iterations covered in one vector iteration. */
10137 unsigned lup_mul = (nvects * const_nunits) / group_size;
10138 lupdate_mul
10139 = build_vector_from_val (step_vectype,
10140 SCALAR_FLOAT_TYPE_P (stept)
10141 ? build_real_from_wide (stept, lup_mul,
10142 UNSIGNED)
10143 : build_int_cstu (stept, lup_mul));
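/* Continuing the group size 3, const_nunits 4 example above with
   nvects 3, one vector iteration covers lup_mul = (3 * 4) / 3 = 4
   scalar iterations of the group, so each IV is advanced by four
   times its scalar step in the latch update below.  */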
10145 tree peel_mul = NULL_TREE;
10146 gimple_seq init_stmts = NULL;
10147 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10149 if (SCALAR_FLOAT_TYPE_P (stept))
10150 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10151 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10152 else
10153 peel_mul = gimple_convert (&init_stmts, stept,
10154 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10155 peel_mul = gimple_build_vector_from_val (&init_stmts,
10156 step_vectype, peel_mul);
10158 unsigned ivn;
10159 auto_vec<tree> vec_steps;
10160 for (ivn = 0; ivn < nivs; ++ivn)
10162 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10163 tree_vector_builder init_elts (vectype, const_nunits, 1);
10164 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10165 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10167 /* The scalar steps of the IVs. */
10168 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10169 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10170 step_elts.quick_push (elt);
10171 if (!init_node)
10173 /* The scalar inits of the IVs if not vectorized. */
10174 elt = inits[(ivn*const_nunits + eltn) % group_size];
10175 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10176 TREE_TYPE (elt)))
10177 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10178 TREE_TYPE (vectype), elt);
10179 init_elts.quick_push (elt);
10181 /* The number of steps to add to the initial values. */
10182 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10183 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10184 ? build_real_from_wide (stept,
10185 mul_elt, UNSIGNED)
10186 : build_int_cstu (stept, mul_elt));
10188 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10189 vec_steps.safe_push (vec_step);
10190 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10191 if (peel_mul)
10192 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10193 step_mul, peel_mul);
10194 if (!init_node)
10195 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10197 /* Create the induction-phi that defines the induction-operand. */
10198 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10199 "vec_iv_");
10200 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10201 induc_def = PHI_RESULT (induction_phi);
10203 /* Create the iv update inside the loop */
10204 tree up = vec_step;
10205 if (lupdate_mul)
10206 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10207 vec_step, lupdate_mul);
10208 gimple_seq stmts = NULL;
10209 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10210 vec_def = gimple_build (&stmts,
10211 PLUS_EXPR, step_vectype, vec_def, up);
10212 vec_def = gimple_convert (&stmts, vectype, vec_def);
10213 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10214 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10215 UNKNOWN_LOCATION);
10217 if (init_node)
10218 vec_init = vect_get_slp_vect_def (init_node, ivn);
10219 if (!nested_in_vect_loop
10220 && !integer_zerop (step_mul))
10222 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10223 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10224 vec_step, step_mul);
10225 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10226 vec_def, up);
10227 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10230 /* Set the arguments of the phi node: */
10231 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10233 slp_node->push_vec_def (induction_phi);
10235 if (!nested_in_vect_loop)
10237 /* Fill up to the number of vectors we need for the whole group. */
10238 nivs = least_common_multiple (group_size,
10239 const_nunits) / const_nunits;
10240 vec_steps.reserve (nivs-ivn);
10241 for (; ivn < nivs; ++ivn)
10243 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10244 vec_steps.quick_push (vec_steps[0]);
10248 /* Re-use IVs when we can. We are generating further vector
10249 stmts by adding VF' * stride to the IVs generated above. */
10250 if (ivn < nvects)
10252 unsigned vfp
10253 = least_common_multiple (group_size, const_nunits) / group_size;
10254 tree lupdate_mul
10255 = build_vector_from_val (step_vectype,
10256 SCALAR_FLOAT_TYPE_P (stept)
10257 ? build_real_from_wide (stept,
10258 vfp, UNSIGNED)
10259 : build_int_cstu (stept, vfp));
10260 for (; ivn < nvects; ++ivn)
10262 gimple *iv
10263 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10264 tree def = gimple_get_lhs (iv);
10265 if (ivn < 2*nivs)
10266 vec_steps[ivn - nivs]
10267 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10268 vec_steps[ivn - nivs], lupdate_mul);
10269 gimple_seq stmts = NULL;
10270 def = gimple_convert (&stmts, step_vectype, def);
10271 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10272 def, vec_steps[ivn % nivs]);
10273 def = gimple_convert (&stmts, vectype, def);
10274 if (gimple_code (iv) == GIMPLE_PHI)
10275 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10276 else
10278 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10279 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10281 slp_node->push_vec_def (def);
10285 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10286 gcc_assert (!new_bb);
10288 return true;
10291 init_expr = vect_phi_initial_value (phi);
10293 gimple_seq stmts = NULL;
10294 if (!nested_in_vect_loop)
10296 /* Convert the initial value to the IV update type. */
10297 tree new_type = TREE_TYPE (step_expr);
10298 init_expr = gimple_convert (&stmts, new_type, init_expr);
10300 /* If we are using the loop mask to "peel" for alignment then we need
10301 to adjust the start value here. */
10302 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10303 if (skip_niters != NULL_TREE)
10305 if (FLOAT_TYPE_P (vectype))
10306 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10307 skip_niters);
10308 else
10309 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10310 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10311 skip_niters, step_expr);
10312 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10313 init_expr, skip_step);
10317 if (stmts)
10319 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10320 gcc_assert (!new_bb);
10323 /* Create the vector that holds the initial_value of the induction. */
10324 if (nested_in_vect_loop)
10326 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10327 been created during vectorization of previous stmts. We obtain it
10328 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10329 auto_vec<tree> vec_inits;
10330 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10331 init_expr, &vec_inits);
10332 vec_init = vec_inits[0];
10333 /* If the initial value is not of proper type, convert it. */
10334 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10336 new_stmt
10337 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10338 vect_simple_var,
10339 "vec_iv_"),
10340 VIEW_CONVERT_EXPR,
10341 build1 (VIEW_CONVERT_EXPR, vectype,
10342 vec_init));
10343 vec_init = gimple_assign_lhs (new_stmt);
10344 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10345 new_stmt);
10346 gcc_assert (!new_bb);
10349 else
10351 /* iv_loop is the loop to be vectorized. Create:
10352 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10353 stmts = NULL;
10354 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10356 unsigned HOST_WIDE_INT const_nunits;
10357 if (nunits.is_constant (&const_nunits))
10359 tree_vector_builder elts (step_vectype, const_nunits, 1);
10360 elts.quick_push (new_name);
10361 for (i = 1; i < const_nunits; i++)
10363 /* Create: new_name_i = new_name + step_expr */
10364 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10365 new_name, step_expr);
10366 elts.quick_push (new_name);
10368 /* Create a vector from [new_name_0, new_name_1, ...,
10369 new_name_nunits-1] */
10370 vec_init = gimple_build_vector (&stmts, &elts);
10372 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10373 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10374 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10375 new_name, step_expr);
10376 else
10378 /* Build:
10379 [base, base, base, ...]
10380 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10381 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10382 gcc_assert (flag_associative_math);
10383 tree index = build_index_vector (step_vectype, 0, 1);
10384 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10385 new_name);
10386 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10387 step_expr);
10388 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10389 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10390 vec_init, step_vec);
10391 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10392 vec_init, base_vec);
10394 vec_init = gimple_convert (&stmts, vectype, vec_init);
10396 if (stmts)
10398 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10399 gcc_assert (!new_bb);
10404 /* Create the vector that holds the step of the induction. */
10405 gimple_stmt_iterator *step_iv_si = NULL;
10406 if (nested_in_vect_loop)
10407 /* iv_loop is nested in the loop to be vectorized. Generate:
10408 vec_step = [S, S, S, S] */
10409 new_name = step_expr;
10410 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10412 /* When we're using the loop_len produced by SELECT_VL, the non-final
10413 iterations do not always process VF elements. So instead of vectorizing
10414 the induction variable update as
10416 _21 = vect_vec_iv_.6_22 + { VF, ... };
10418 we should generate:
10420 _35 = .SELECT_VL (ivtmp_33, VF);
10421 vect_cst__22 = [vec_duplicate_expr] _35;
10422 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10423 gcc_assert (!slp_node);
10424 gimple_seq seq = NULL;
10425 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10426 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10427 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10428 unshare_expr (len)),
10429 &seq, true, NULL_TREE);
10430 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10431 step_expr);
10432 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10433 step_iv_si = &si;
10435 else
10437 /* iv_loop is the loop to be vectorized. Generate:
10438 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10439 gimple_seq seq = NULL;
10440 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10442 expr = build_int_cst (integer_type_node, vf);
10443 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10445 else
10446 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10447 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10448 expr, step_expr);
10449 if (seq)
10451 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10452 gcc_assert (!new_bb);
10456 t = unshare_expr (new_name);
10457 gcc_assert (CONSTANT_CLASS_P (new_name)
10458 || TREE_CODE (new_name) == SSA_NAME);
10459 new_vec = build_vector_from_val (step_vectype, t);
10460 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10461 new_vec, step_vectype, step_iv_si);
10464 /* Create the following def-use cycle:
10465 loop prolog:
10466 vec_init = ...
10467 vec_step = ...
10468 loop:
10469 vec_iv = PHI <vec_init, vec_loop>
10471 STMT
10473 vec_loop = vec_iv + vec_step; */
10475 /* Create the induction-phi that defines the induction-operand. */
10476 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10477 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10478 induc_def = PHI_RESULT (induction_phi);
10480 /* Create the iv update inside the loop */
10481 stmts = NULL;
10482 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10483 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10484 vec_def = gimple_convert (&stmts, vectype, vec_def);
10485 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10486 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10488 /* Set the arguments of the phi node: */
10489 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10490 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10491 UNKNOWN_LOCATION);
10493 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10494 *vec_stmt = induction_phi;
10496 /* In case that vectorization factor (VF) is bigger than the number
10497 of elements that we can fit in a vectype (nunits), we have to generate
10498 more than one vector stmt - i.e - we need to "unroll" the
10499 vector stmt by a factor VF/nunits. For more details see documentation
10500 in vectorizable_operation. */
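/* E.g. with VF 8 and a 4-element vectype, ncopies is 2: the PHI result
   is the first copy, the second copy is vec_iv + 4*S, and the value
   vec_iv + 8*S becomes the latch definition.  */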
10502 if (ncopies > 1)
10504 gimple_seq seq = NULL;
10505 /* FORNOW. This restriction should be relaxed. */
10506 gcc_assert (!nested_in_vect_loop);
10507 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10508 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10510 /* Create the vector that holds the step of the induction. */
10511 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10513 expr = build_int_cst (integer_type_node, nunits);
10514 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10516 else
10517 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10518 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10519 expr, step_expr);
10520 if (seq)
10522 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10523 gcc_assert (!new_bb);
10526 t = unshare_expr (new_name);
10527 gcc_assert (CONSTANT_CLASS_P (new_name)
10528 || TREE_CODE (new_name) == SSA_NAME);
10529 new_vec = build_vector_from_val (step_vectype, t);
10530 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10531 new_vec, step_vectype, NULL);
10533 vec_def = induc_def;
10534 for (i = 1; i < ncopies + 1; i++)
10536 /* vec_i = vec_prev + vec_step */
10537 gimple_seq stmts = NULL;
10538 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10539 vec_def = gimple_build (&stmts,
10540 PLUS_EXPR, step_vectype, vec_def, vec_step);
10541 vec_def = gimple_convert (&stmts, vectype, vec_def);
10543 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10544 if (i < ncopies)
10546 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10547 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10549 else
10551 /* vec_1 = vec_iv + (VF/n * S)
10552 vec_2 = vec_1 + (VF/n * S)
10554 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10556 vec_n is used as vec_loop to save the large step register and
10557 related operations. */
10558 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10559 UNKNOWN_LOCATION);
10564 if (dump_enabled_p ())
10565 dump_printf_loc (MSG_NOTE, vect_location,
10566 "transform induction: created def-use cycle: %G%G",
10567 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10569 return true;
10572 /* Function vectorizable_live_operation_1.
10574 helper function for vectorizable_live_operation. */
10576 tree
10577 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10578 stmt_vec_info stmt_info, basic_block exit_bb,
10579 tree vectype, int ncopies, slp_tree slp_node,
10580 tree bitsize, tree bitstart, tree vec_lhs,
10581 tree lhs_type, bool restart_loop,
10582 gimple_stmt_iterator *exit_gsi)
10584 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10586 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10587 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10588 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10589 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10591 gimple_seq stmts = NULL;
10592 tree new_tree;
10593 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10595 /* Emit:
10597 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10599 where VEC_LHS is the vectorized live-out result and LEN is
10600 the loop length for the final iteration. */
10601 gcc_assert (ncopies == 1 && !slp_node);
10602 gimple_seq tem = NULL;
10603 gimple_stmt_iterator gsi = gsi_last (tem);
10604 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10605 &LOOP_VINFO_LENS (loop_vinfo),
10606 1, vectype, 0, 0);
10608 /* BIAS - 1. */
10609 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10610 tree bias_minus_one
10611 = int_const_binop (MINUS_EXPR,
10612 build_int_cst (TREE_TYPE (len), biasval),
10613 build_one_cst (TREE_TYPE (len)));
10615 /* LAST_INDEX = LEN + (BIAS - 1). */
10616 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10617 len, bias_minus_one);
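/* E.g. with a zero partial-load/store bias and LEN = 3 active elements
   in the final iteration, LAST_INDEX = 3 + (0 - 1) = 2, i.e. the last
   active lane of VEC_LHS.  */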
10619 /* This would need to extract the first active lane instead, but it is
10620 unclear how that interacts with the LEN handling. At the moment we
10621 should not get here since there is no LEN support for early breaks,
10622 but guard this so there is no incorrect codegen. */
10623 gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10625 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10626 tree scalar_res
10627 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10628 vec_lhs_phi, last_index);
10630 /* Convert the extracted vector element to the scalar type. */
10631 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10633 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10635 /* Emit:
10637 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10639 where VEC_LHS is the vectorized live-out result and MASK is
10640 the loop mask for the final iteration. */
10641 gcc_assert (!slp_node);
10642 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10643 gimple_seq tem = NULL;
10644 gimple_stmt_iterator gsi = gsi_last (tem);
10645 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10646 &LOOP_VINFO_MASKS (loop_vinfo),
10647 1, vectype, 0);
10648 tree scalar_res;
10650 /* For an inverted control flow with early breaks we want EXTRACT_FIRST
10651 instead of EXTRACT_LAST. Emulate by reversing the vector and mask. */
10652 if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10654 /* First create the permuted mask. */
10655 tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
10656 tree perm_dest = copy_ssa_name (mask);
10657 gimple *perm_stmt
10658 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
10659 mask, perm_mask);
10660 vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
10661 &gsi);
10662 mask = perm_dest;
10664 /* Then permute the vector contents. */
10665 tree perm_elem = perm_mask_for_reverse (vectype);
10666 perm_dest = copy_ssa_name (vec_lhs_phi);
10667 perm_stmt
10668 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
10669 vec_lhs_phi, perm_elem);
10670 vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
10671 &gsi);
10672 vec_lhs_phi = perm_dest;
10675 gimple_seq_add_seq (&stmts, tem);
10677 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10678 mask, vec_lhs_phi);
10680 /* Convert the extracted vector element to the scalar type. */
10681 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10683 else
10685 tree bftype = TREE_TYPE (vectype);
10686 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10687 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10688 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10689 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10690 &stmts, true, NULL_TREE);
10693 *exit_gsi = gsi_after_labels (exit_bb);
10694 if (stmts)
10695 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10697 return new_tree;
10700 /* Find the edge that's the final one in the path from SRC to DEST and
10701 return it. At most one forwarder edge may lie between SRC and DEST. */
10703 static edge
10704 find_connected_edge (edge src, basic_block dest)
10706 if (src->dest == dest)
10707 return src;
10709 return find_edge (src->dest, dest);
10712 /* Function vectorizable_live_operation.
10714 STMT_INFO computes a value that is used outside the loop. Check if
10715 it can be supported. */
10717 bool
10718 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10719 slp_tree slp_node, slp_instance slp_node_instance,
10720 int slp_index, bool vec_stmt_p,
10721 stmt_vector_for_cost *cost_vec)
10723 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10724 imm_use_iterator imm_iter;
10725 tree lhs, lhs_type, bitsize;
10726 tree vectype = (slp_node
10727 ? SLP_TREE_VECTYPE (slp_node)
10728 : STMT_VINFO_VECTYPE (stmt_info));
10729 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10730 int ncopies;
10731 gimple *use_stmt;
10732 use_operand_p use_p;
10733 auto_vec<tree> vec_oprnds;
10734 int vec_entry = 0;
10735 poly_uint64 vec_index = 0;
10737 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10738 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10740 /* If a stmt of a reduction is live, vectorize it via
10741 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10742 validity so just trigger the transform here. */
10743 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10745 if (!vec_stmt_p)
10746 return true;
10747 if (slp_node)
10749 /* For reduction chains the meta-info is attached to
10750 the group leader. */
10751 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10752 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10753 /* For SLP reductions we vectorize the epilogue for
10754 all involved stmts together. */
10755 else if (slp_index != 0)
10756 return true;
10758 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10759 gcc_assert (reduc_info->is_reduc_info);
10760 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10761 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10762 return true;
10764 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10765 slp_node_instance,
10766 LOOP_VINFO_IV_EXIT (loop_vinfo));
10768 /* If early break we only have to materialize the reduction on the merge
10769 block, but we have to find an alternate exit first. */
10770 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10772 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10773 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10775 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10776 slp_node, slp_node_instance,
10777 exit);
10778 break;
10782 return true;
10785 /* If STMT is not relevant and it is a simple assignment and its inputs are
10786 invariant then it can remain in place, unvectorized. The original last
10787 scalar value that it computes will be used. */
10788 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10790 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10791 if (dump_enabled_p ())
10792 dump_printf_loc (MSG_NOTE, vect_location,
10793 "statement is simple and uses invariant. Leaving in "
10794 "place.\n");
10795 return true;
10798 if (slp_node)
10799 ncopies = 1;
10800 else
10801 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10803 if (slp_node)
10805 gcc_assert (slp_index >= 0);
10807 /* Get the last occurrence of the scalar index from the concatenation of
10808 all the slp vectors. Calculate which slp vector it is and the index
10809 within. */
10810 int num_scalar = SLP_TREE_LANES (slp_node);
10811 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10812 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
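/* E.g. for 3 scalar lanes spread over 2 vectors of 4 units, the last
   occurrences live in lanes 5, 6 and 7 of the concatenation; slp_index 1
   gives pos = 8 - 3 + 1 = 6, i.e. vector 1, lane 2.  */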
10814 /* Calculate which vector contains the result, and which lane of
10815 that vector we need. */
10816 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10818 if (dump_enabled_p ())
10819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10820 "Cannot determine which vector holds the"
10821 " final result.\n");
10822 return false;
10826 if (!vec_stmt_p)
10828 /* No transformation required. */
10829 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10831 if (slp_node)
10833 if (dump_enabled_p ())
10834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10835 "can't operate on partial vectors "
10836 "because an SLP statement is live after "
10837 "the loop.\n");
10838 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10840 else if (ncopies > 1)
10842 if (dump_enabled_p ())
10843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10844 "can't operate on partial vectors "
10845 "because ncopies is greater than 1.\n");
10846 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10848 else
10850 gcc_assert (ncopies == 1 && !slp_node);
10851 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10852 OPTIMIZE_FOR_SPEED))
10853 vect_record_loop_mask (loop_vinfo,
10854 &LOOP_VINFO_MASKS (loop_vinfo),
10855 1, vectype, NULL);
10856 else if (can_vec_extract_var_idx_p (
10857 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10858 vect_record_loop_len (loop_vinfo,
10859 &LOOP_VINFO_LENS (loop_vinfo),
10860 1, vectype, 1);
10861 else
10863 if (dump_enabled_p ())
10864 dump_printf_loc (
10865 MSG_MISSED_OPTIMIZATION, vect_location,
10866 "can't operate on partial vectors "
10867 "because the target doesn't support extract "
10868 "last reduction.\n");
10869 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10873 /* ??? Enable for loop costing as well. */
10874 if (!loop_vinfo)
10875 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10876 0, vect_epilogue);
10877 return true;
10880 /* Use the lhs of the original scalar statement. */
10881 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10882 if (dump_enabled_p ())
10883 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10884 "stmt %G", stmt);
10886 lhs = gimple_get_lhs (stmt);
10887 lhs_type = TREE_TYPE (lhs);
10889 bitsize = vector_element_bits_tree (vectype);
10891 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10892 tree vec_lhs, vec_lhs0, bitstart;
10893 gimple *vec_stmt, *vec_stmt0;
10894 if (slp_node)
10896 gcc_assert (!loop_vinfo
10897 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10898 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10900 /* Get the correct slp vectorized stmt. */
10901 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10902 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10904 /* In case we need to early break vectorize also get the first stmt. */
10905 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10906 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10908 /* Get entry to use. */
10909 bitstart = bitsize_int (vec_index);
10910 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10912 else
10914 /* For multiple copies, get the last copy. */
10915 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10916 vec_lhs = gimple_get_lhs (vec_stmt);
10918 /* In case we need to early break vectorize also get the first stmt. */
10919 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10920 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10922 /* Get the last lane in the vector. */
10923 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
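/* E.g. for a vector of four 32-bit elements this is 32 * 3 = 96, so the
   BIT_FIELD_REF lane extraction reads bits [96, 127].  */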
10926 if (loop_vinfo)
10928 /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10929 PHI requirement; insert one phi node for it. It looks like:
10930 loop;
10932 # lhs' = PHI <lhs>
10934 loop;
10936 # vec_lhs' = PHI <vec_lhs>
10937 new_tree = lane_extract <vec_lhs', ...>;
10938 lhs' = new_tree; */
10940 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10941 /* Check if we have a loop where the chosen exit is not the main exit;
10942 in these cases, for an early break, we restart the iteration the vector
10943 code was performing. For the live values we then want the value at the
10944 start of that iteration rather than at the end. */
10945 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10946 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10947 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10948 if (!is_gimple_debug (use_stmt)
10949 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10950 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10952 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10953 phi_arg_index_from_use (use_p));
10954 bool main_exit_edge = e == main_e
10955 || find_connected_edge (main_e, e->src);
10957 /* Early exits have a merge block; we want the merge block itself,
10958 so use ->src. For the main exit the merge block is the
10959 destination. */
10960 basic_block dest = main_exit_edge ? main_e->dest : e->src;
10961 tree tmp_vec_lhs = vec_lhs;
10962 tree tmp_bitstart = bitstart;
10964 /* For early exit where the exit is not in the BB that leads
10965 to the latch then we're restarting the iteration in the
10966 scalar loop. So get the first live value. */
10967 restart_loop = restart_loop || !main_exit_edge;
10968 if (restart_loop
10969 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10971 tmp_vec_lhs = vec_lhs0;
10972 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10975 gimple_stmt_iterator exit_gsi;
10976 tree new_tree
10977 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10978 dest, vectype, ncopies,
10979 slp_node, bitsize,
10980 tmp_bitstart, tmp_vec_lhs,
10981 lhs_type, restart_loop,
10982 &exit_gsi);
10984 if (gimple_phi_num_args (use_stmt) == 1)
10986 auto gsi = gsi_for_stmt (use_stmt);
10987 remove_phi_node (&gsi, false);
10988 tree lhs_phi = gimple_phi_result (use_stmt);
10989 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10990 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10992 else
10993 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
10996 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10997 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10998 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11000 else
11002 /* For basic-block vectorization simply insert the lane-extraction. */
11003 tree bftype = TREE_TYPE (vectype);
11004 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11005 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11006 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11007 vec_lhs, bitsize, bitstart);
11008 gimple_seq stmts = NULL;
11009 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11010 &stmts, true, NULL_TREE);
11011 if (TREE_CODE (new_tree) == SSA_NAME
11012 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11013 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11014 if (is_a <gphi *> (vec_stmt))
11016 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11017 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11019 else
11021 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11022 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11025 /* Replace use of lhs with newly computed result. If the use stmt is a
11026 single arg PHI, just replace all uses of PHI result. It's necessary
11027 because the LC-SSA PHI defining lhs may appear before the newly inserted stmt. */
11028 use_operand_p use_p;
11029 stmt_vec_info use_stmt_info;
11030 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11031 if (!is_gimple_debug (use_stmt)
11032 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11033 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11035 /* ??? This can happen when the live lane ends up being
11036 rooted in a vector construction code-generated by an
11037 external SLP node (and code-generation for that already
11038 happened). See gcc.dg/vect/bb-slp-47.c.
11039 Doing this is what would happen if that vector CTOR
11040 were not code-generated yet so it is not too bad.
11041 ??? In fact we'd likely want to avoid this situation
11042 in the first place. */
11043 if (TREE_CODE (new_tree) == SSA_NAME
11044 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11045 && gimple_code (use_stmt) != GIMPLE_PHI
11046 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11047 use_stmt))
11049 if (dump_enabled_p ())
11050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11051 "Using original scalar computation for "
11052 "live lane because use preceeds vector "
11053 "def\n");
11054 continue;
11056 /* ??? It can also happen that we end up pulling a def into
11057 a loop where replacing out-of-loop uses would require
11058 a new LC SSA PHI node. Retain the original scalar in
11059 those cases as well. PR98064. */
11060 if (TREE_CODE (new_tree) == SSA_NAME
11061 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11062 && (gimple_bb (use_stmt)->loop_father
11063 != gimple_bb (vec_stmt)->loop_father)
11064 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11065 gimple_bb (use_stmt)->loop_father))
11067 if (dump_enabled_p ())
11068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11069 "Using original scalar computation for "
11070 "live lane because there is an out-of-loop "
11071 "definition for it\n");
11072 continue;
11074 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11075 SET_USE (use_p, new_tree);
11076 update_stmt (use_stmt);
11080 return true;
11083 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11085 static void
11086 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11088 ssa_op_iter op_iter;
11089 imm_use_iterator imm_iter;
11090 def_operand_p def_p;
11091 gimple *ustmt;
11093 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11095 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11097 basic_block bb;
11099 if (!is_gimple_debug (ustmt))
11100 continue;
11102 bb = gimple_bb (ustmt);
11104 if (!flow_bb_inside_loop_p (loop, bb))
11106 if (gimple_debug_bind_p (ustmt))
11108 if (dump_enabled_p ())
11109 dump_printf_loc (MSG_NOTE, vect_location,
11110 "killing debug use\n");
11112 gimple_debug_bind_reset_value (ustmt);
11113 update_stmt (ustmt);
11115 else
11116 gcc_unreachable ();
11122 /* Given loop represented by LOOP_VINFO, return true if computation of
11123 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11124 otherwise. */
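/* For example, if NITERS has a 32-bit unsigned type and NITERSM1 is
   0xffffffff, then NITERSM1 + 1 wraps around to 0; the computation
   overflows and false is returned.  */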
11126 static bool
11127 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11129 /* Constant case. */
11130 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11132 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11133 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11135 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11136 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11137 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11138 return true;
11141 widest_int max;
11142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11143 /* Check the upper bound of loop niters. */
11144 if (get_max_loop_iterations (loop, &max))
11146 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11147 signop sgn = TYPE_SIGN (type);
11148 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11149 if (max < type_max)
11150 return true;
11152 return false;
11155 /* Return a mask type with half the number of elements as OLD_TYPE,
11156 given that it should have mode NEW_MODE. */
11158 tree
11159 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11161 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11162 return build_truth_vector_type_for_mode (nunits, new_mode);
11165 /* Return a mask type with twice as many elements as OLD_TYPE,
11166 given that it should have mode NEW_MODE. */
11168 tree
11169 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11171 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11172 return build_truth_vector_type_for_mode (nunits, new_mode);
11175 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11176 contain a sequence of NVECTORS masks that each control a vector of type
11177 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11178 these vector masks with the vector version of SCALAR_MASK. */
11180 void
11181 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11182 unsigned int nvectors, tree vectype, tree scalar_mask)
11184 gcc_assert (nvectors != 0);
11186 if (scalar_mask)
11188 scalar_cond_masked_key cond (scalar_mask, nvectors);
11189 loop_vinfo->scalar_cond_masked_set.add (cond);
11192 masks->mask_set.add (std::make_pair (vectype, nvectors));
11195 /* Given a complete set of masks MASKS, extract mask number INDEX
11196 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11197 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11199 See the comment above vec_loop_masks for more details about the mask
11200 arrangement. */
11202 tree
11203 vect_get_loop_mask (loop_vec_info loop_vinfo,
11204 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11205 unsigned int nvectors, tree vectype, unsigned int index)
11207 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11208 == vect_partial_vectors_while_ult)
11210 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11211 tree mask_type = rgm->type;
11213 /* Populate the rgroup's mask array, if this is the first time we've
11214 used it. */
11215 if (rgm->controls.is_empty ())
11217 rgm->controls.safe_grow_cleared (nvectors, true);
11218 for (unsigned int i = 0; i < nvectors; ++i)
11220 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11221 /* Provide a dummy definition until the real one is available. */
11222 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11223 rgm->controls[i] = mask;
11227 tree mask = rgm->controls[index];
11228 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11229 TYPE_VECTOR_SUBPARTS (vectype)))
11231 /* A loop mask for data type X can be reused for data type Y
11232 if X has N times more elements than Y and if Y's elements
11233 are N times bigger than X's. In this case each sequence
11234 of N elements in the loop mask will be all-zero or all-one.
11235 We can then view-convert the mask so that each sequence of
11236 N elements is replaced by a single element. */
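/* For example, a mask recorded for 16 x 8-bit data can be reused for
   8 x 16-bit data: each pair of adjacent mask elements is known to be
   all-zero or all-one, so the VIEW_CONVERT below folds every such pair
   into a single element of the 8-element mask type.  */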
11237 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11238 TYPE_VECTOR_SUBPARTS (vectype)));
11239 gimple_seq seq = NULL;
11240 mask_type = truth_type_for (vectype);
11241 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11242 if (seq)
11243 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11245 return mask;
11247 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11248 == vect_partial_vectors_avx512)
11250 /* The number of scalars per iteration and the number of vectors are
11251 both compile-time constants. */
11252 unsigned int nscalars_per_iter
11253 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11254 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11256 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11258 /* The stored nV is dependent on the mask type produced. */
11259 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11260 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11261 == rgm->factor);
11262 nvectors = rgm->factor;
11264 /* Populate the rgroup's mask array, if this is the first time we've
11265 used it. */
11266 if (rgm->controls.is_empty ())
11268 rgm->controls.safe_grow_cleared (nvectors, true);
11269 for (unsigned int i = 0; i < nvectors; ++i)
11271 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11272 /* Provide a dummy definition until the real one is available. */
11273 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11274 rgm->controls[i] = mask;
11277 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11278 TYPE_VECTOR_SUBPARTS (vectype)))
11279 return rgm->controls[index];
11281 /* Split the vector if needed. Since we are dealing with integer mode
11282 masks with AVX512 we can operate on the integer representation,
11283 performing the selection with whole-vector shifts. */
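/* For example, with a 16-element rgm->type and a 4-element VECTYPE,
   factor is 4; INDEX 6 selects control 1 (vi) and sub-part 2 (vpart),
   so the 16-bit integer view of that control is shifted right by
   4 * 2 = 8 bits and then narrowed to the result mask's integer mode,
   leaving lanes 8..11 in the low bits.  */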
11284 unsigned HOST_WIDE_INT factor;
11285 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11286 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11287 gcc_assert (ok);
11288 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11289 tree mask_type = truth_type_for (vectype);
11290 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11291 unsigned vi = index / factor;
11292 unsigned vpart = index % factor;
11293 tree vec = rgm->controls[vi];
11294 gimple_seq seq = NULL;
11295 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11296 lang_hooks.types.type_for_mode
11297 (TYPE_MODE (rgm->type), 1), vec);
11298 /* For integer mode masks simply shift the right bits into position. */
11299 if (vpart != 0)
11300 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11301 build_int_cst (integer_type_node,
11302 (TYPE_VECTOR_SUBPARTS (vectype)
11303 * vpart)));
11304 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11305 (TYPE_MODE (mask_type), 1), vec);
11306 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11307 if (seq)
11308 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11309 return vec;
11311 else
11312 gcc_unreachable ();
11315 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11316 lengths for controlling an operation on VECTYPE. The operation splits
11317 each element of VECTYPE into FACTOR separate subelements, measuring the
11318 length as a number of these subelements. */
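/* For instance, an access to 4-byte elements that the target can only
   length-control as VnQI would use FACTOR 4; a recorded length of 8
   subelements then covers two elements of VECTYPE.  */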
11320 void
11321 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11322 unsigned int nvectors, tree vectype, unsigned int factor)
11324 gcc_assert (nvectors != 0);
11325 if (lens->length () < nvectors)
11326 lens->safe_grow_cleared (nvectors, true);
11327 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11329 /* The number of scalars per iteration, the bytes a scalar occupies and
11330 the number of vectors are all compile-time constants. */
11331 unsigned int nscalars_per_iter
11332 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11333 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11335 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11337 /* For now, we only support cases in which all loads and stores fall back
11338 to VnQI or none do. */
11339 gcc_assert (!rgl->max_nscalars_per_iter
11340 || (rgl->factor == 1 && factor == 1)
11341 || (rgl->max_nscalars_per_iter * rgl->factor
11342 == nscalars_per_iter * factor));
11343 rgl->max_nscalars_per_iter = nscalars_per_iter;
11344 rgl->type = vectype;
11345 rgl->factor = factor;
11349 /* Given a complete set of lengths LENS, extract length number INDEX
11350 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11351 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11352 multiplied by the number of elements that should be processed.
11353 Insert any set-up statements before GSI. */
11355 tree
11356 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11357 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11358 unsigned int index, unsigned int factor)
11360 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11361 bool use_bias_adjusted_len =
11362 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11364 /* Populate the rgroup's len array, if this is the first time we've
11365 used it. */
11366 if (rgl->controls.is_empty ())
11368 rgl->controls.safe_grow_cleared (nvectors, true);
11369 for (unsigned int i = 0; i < nvectors; ++i)
11371 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11372 gcc_assert (len_type != NULL_TREE);
11374 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11376 /* Provide a dummy definition until the real one is available. */
11377 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11378 rgl->controls[i] = len;
11380 if (use_bias_adjusted_len)
11382 gcc_assert (i == 0);
11383 tree adjusted_len =
11384 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11385 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11386 rgl->bias_adjusted_ctrl = adjusted_len;
11391 if (use_bias_adjusted_len)
11392 return rgl->bias_adjusted_ctrl;
11394 tree loop_len = rgl->controls[index];
11395 if (rgl->factor == 1 && factor == 1)
11397 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11398 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11399 if (maybe_ne (nunits1, nunits2))
11401 /* A loop len for data type X can be reused for data type Y
11402 if X has N times more elements than Y and if Y's elements
11403 are N times bigger than X's. */
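/* For example, a length computed for 16 x 8-bit elements can serve a
   request for 8 x 16-bit elements by dividing it by
   factor = 16 / 8 = 2 below.  */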
11404 gcc_assert (multiple_p (nunits1, nunits2));
11405 factor = exact_div (nunits1, nunits2).to_constant ();
11406 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11407 gimple_seq seq = NULL;
11408 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11409 build_int_cst (iv_type, factor));
11410 if (seq)
11411 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11414 return loop_len;
11417 /* Scale profiling counters by estimation for LOOP which is vectorized
11418 by factor VF.
11419 If FLAT is true, the loop we started with had unrealistically flat
11420 profile. */
11422 static void
11423 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11425 /* For flat profiles do not scale down proportionally by VF and only
11426 cap by known iteration count bounds. */
11427 if (flat)
11429 if (dump_file && (dump_flags & TDF_DETAILS))
11430 fprintf (dump_file,
11431 "Vectorized loop profile seems flat; not scaling iteration "
11432 "count down by the vectorization factor %i\n", vf);
11433 scale_loop_profile (loop, profile_probability::always (),
11434 get_likely_max_loop_iterations_int (loop));
11435 return;
11437 /* The loop body executes VF times fewer iterations and the exit is taken VF times more often. */
11438 profile_count entry_count = loop_preheader_edge (loop)->count ();
11440 /* If we have unreliable loop profile avoid dropping entry
11441 count below header count. This can happen since the loop
11442 has unrealistically low trip counts. */
11443 while (vf > 1
11444 && loop->header->count > entry_count
11445 && loop->header->count < entry_count * vf)
11447 if (dump_file && (dump_flags & TDF_DETAILS))
11448 fprintf (dump_file,
11449 "Vectorization factor %i seems too large for profile "
11450 "prevoiusly believed to be consistent; reducing.\n", vf);
11451 vf /= 2;
11454 if (entry_count.nonzero_p ())
11455 set_edge_probability_and_rescale_others
11456 (exit_e,
11457 entry_count.probability_in (loop->header->count / vf));
11458 /* Avoid producing very large exit probability when we do not have
11459 sensible profile. */
11460 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11461 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11462 loop->latch->count = single_pred_edge (loop->latch)->count ();
11464 scale_loop_profile (loop, profile_probability::always () / vf,
11465 get_likely_max_loop_iterations_int (loop));
11468 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11469 latch edge values originally defined by it. */
11471 static void
11472 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11473 stmt_vec_info def_stmt_info)
11475 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11476 if (!def || TREE_CODE (def) != SSA_NAME)
11477 return;
11478 stmt_vec_info phi_info;
11479 imm_use_iterator iter;
11480 use_operand_p use_p;
11481 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11483 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11484 if (!phi)
11485 continue;
11486 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11487 && (phi_info = loop_vinfo->lookup_stmt (phi))
11488 && STMT_VINFO_RELEVANT_P (phi_info)))
11489 continue;
11490 loop_p loop = gimple_bb (phi)->loop_father;
11491 edge e = loop_latch_edge (loop);
11492 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11493 continue;
11495 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11496 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11497 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11499 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11500 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11501 gcc_assert (phi_defs.length () == latch_defs.length ());
11502 for (unsigned i = 0; i < phi_defs.length (); ++i)
11503 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11504 gimple_get_lhs (latch_defs[i]), e,
11505 gimple_phi_arg_location (phi, e->dest_idx));
11507 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11509 /* For first order recurrences we have to update both uses of
11510 the latch definition, the one in the PHI node and the one
11511 in the generated VEC_PERM_EXPR. */
11512 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11513 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11514 gcc_assert (phi_defs.length () == latch_defs.length ());
11515 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11516 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11517 for (unsigned i = 0; i < phi_defs.length (); ++i)
11519 gassign *perm = as_a <gassign *> (phi_defs[i]);
11520 if (i > 0)
11521 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11522 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11523 update_stmt (perm);
11525 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11526 gimple_phi_arg_location (phi, e->dest_idx));
11531 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11532 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11533 stmt_vec_info. */
11535 static bool
11536 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11537 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11539 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11540 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11542 if (dump_enabled_p ())
11543 dump_printf_loc (MSG_NOTE, vect_location,
11544 "------>vectorizing statement: %G", stmt_info->stmt);
11546 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11547 vect_loop_kill_debug_uses (loop, stmt_info);
11549 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11550 && !STMT_VINFO_LIVE_P (stmt_info))
11552 if (is_gimple_call (stmt_info->stmt)
11553 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11555 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11556 *seen_store = stmt_info;
11557 return false;
11559 return false;
11562 if (STMT_VINFO_VECTYPE (stmt_info))
11564 poly_uint64 nunits
11565 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11566 if (!STMT_SLP_TYPE (stmt_info)
11567 && maybe_ne (nunits, vf)
11568 && dump_enabled_p ())
11569 /* For SLP VF is set according to unrolling factor, and not
11570 to vector size, hence for SLP this print is not valid. */
11571 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11574 /* Pure SLP statements have already been vectorized. We still need
11575 to apply loop vectorization to hybrid SLP statements. */
11576 if (PURE_SLP_STMT (stmt_info))
11577 return false;
11579 if (dump_enabled_p ())
11580 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11582 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11583 *seen_store = stmt_info;
11585 return true;
11588 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11589 in the hash_map with its corresponding values. */
11591 static tree
11592 find_in_mapping (tree t, void *context)
11594 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11596 tree *value = mapping->get (t);
11597 return value ? *value : t;
11600 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11601 original loop that has now been vectorized.
11603 The inits of the data_references need to be advanced with the number of
11604 iterations of the main loop. This has been computed in vect_do_peeling and
11605 is stored in parameter ADVANCE. We first restore the data_references
11606 initial offset with the values recorded in ORIG_DRS_INIT.
11608 Since the loop_vec_info of this EPILOGUE was constructed for the original
11609 loop, its stmt_vec_infos all point to the original statements. These need
11610 to be updated to point to their corresponding copies as well as the SSA_NAMES
11611 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11613 The data_references' connections also need to be updated: their
11614 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11615 stmt_vec_infos, their statements need to point to their corresponding copies,
11616 and if they are gather loads or scatter stores their references need to be
11617 updated to point to the corresponding copies. Finally we set
11618 'base_misaligned' to false, as we have already peeled for alignment in the
11619 prologue of the main loop. */
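/* As an illustrative sketch (SSA names invented for the example): if the
   main loop contains the statement

     _5 = _4 + 1;

   and its copy in the EPILOGUE is

     _25 = _24 + 1;

   then MAPPING records _5 -> _25, and any pattern statement, related
   statement or gather/scatter DR_REF in the epilogue's loop_vec_info
   that still mentions _5 is rewritten below via find_in_mapping.  */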
11621 static void
11622 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11624 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11625 auto_vec<gimple *> stmt_worklist;
11626 hash_map<tree,tree> mapping;
11627 gimple *orig_stmt, *new_stmt;
11628 gimple_stmt_iterator epilogue_gsi;
11629 gphi_iterator epilogue_phi_gsi;
11630 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11631 basic_block *epilogue_bbs = get_loop_body (epilogue);
11632 unsigned i;
11634 free (LOOP_VINFO_BBS (epilogue_vinfo));
11635 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11637 /* Advance data_reference's with the number of iterations of the previous
11638 loop and its prologue. */
11639 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11642 /* The EPILOGUE loop is a copy of the original loop so they share the same
11643 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11644 point to the copied statements. We also create a mapping from each LHS in
11645 the original loop to the corresponding LHS in the EPILOGUE and create worklists
11646 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11647 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11649 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11650 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11652 new_stmt = epilogue_phi_gsi.phi ();
11654 gcc_assert (gimple_uid (new_stmt) > 0);
11655 stmt_vinfo
11656 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11658 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11659 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11661 mapping.put (gimple_phi_result (orig_stmt),
11662 gimple_phi_result (new_stmt));
11663 /* PHI nodes can not have patterns or related statements. */
11664 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11665 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11668 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11669 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11671 new_stmt = gsi_stmt (epilogue_gsi);
11672 if (is_gimple_debug (new_stmt))
11673 continue;
11675 gcc_assert (gimple_uid (new_stmt) > 0);
11676 stmt_vinfo
11677 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11679 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11680 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11682 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11683 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11685 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11687 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11688 for (gimple_stmt_iterator gsi = gsi_start (seq);
11689 !gsi_end_p (gsi); gsi_next (&gsi))
11690 stmt_worklist.safe_push (gsi_stmt (gsi));
11693 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11694 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11696 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11697 stmt_worklist.safe_push (stmt);
11698 /* Set BB such that the assert in
11699 'get_initial_def_for_reduction' is able to determine that
11700 the BB of the related stmt is inside this loop. */
11701 gimple_set_bb (stmt,
11702 gimple_bb (new_stmt));
11703 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11704 gcc_assert (related_vinfo == NULL
11705 || related_vinfo == stmt_vinfo);
11710 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11711 using the original main loop and thus need to be updated to refer to the
11712 cloned variables used in the epilogue. */
11713 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11715 gimple *stmt = stmt_worklist[i];
11716 tree *new_op;
11718 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11720 tree op = gimple_op (stmt, j);
11721 if ((new_op = mapping.get(op)))
11722 gimple_set_op (stmt, j, *new_op);
11723 else
11725 /* PR92429: The last argument of simplify_replace_tree disables
11726 folding when replacing arguments. This is required as
11727 otherwise you might end up with different statements than the
11728 ones analyzed in vect_analyze_loop, leading to different
11729 vectorization. */
11730 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11731 &find_in_mapping, &mapping, false);
11732 gimple_set_op (stmt, j, op);
11737 struct data_reference *dr;
11738 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11739 FOR_EACH_VEC_ELT (datarefs, i, dr)
11741 orig_stmt = DR_STMT (dr);
11742 gcc_assert (gimple_uid (orig_stmt) > 0);
11743 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11744 /* Data references for gather loads and scatter stores do not use the
11745 updated offset we set using ADVANCE. Instead we have to make sure the
11746 reference in each data reference points to the corresponding copy of
11747 the original in the epilogue. Make sure to update both
11748 gather/scatters recognized by dataref analysis and also other
11749 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11750 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11751 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11752 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11754 DR_REF (dr)
11755 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11756 &find_in_mapping, &mapping);
11757 DR_BASE_ADDRESS (dr)
11758 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11759 &find_in_mapping, &mapping);
11761 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11762 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11763 /* The vector size of the epilogue is smaller than that of the main loop
11764 so the required alignment is either the same or lower. The dr will
11765 therefore by definition be aligned. */
11766 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11769 epilogue_vinfo->shared->datarefs_copy.release ();
11770 epilogue_vinfo->shared->save_datarefs ();
11773 /* When vectorizing early break statements, instructions that happen before
11774 the early break in the current BB need to be moved to after the early
11775 break. This function deals with that and assumes that any validity
11776 checks have already been performed.
11778 After the statements have been moved, the loads recorded in
11779 LOOP_VINFO_EARLY_BRK_VUSES are updated with their new reaching VUSE.
11780 The statements are inserted at the start of LOOP_VINFO_EARLY_BRK_DEST_BB. */
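/* As an illustrative sketch, in

     for (i = 0; i < n; i++)
       {
         a[i] = b[i];
         if (b[i] == x)
           break;
       }

   the store to a[i] sits before the early exit and is one of the
   statements moved below.  */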
11782 static void
11783 move_early_exit_stmts (loop_vec_info loop_vinfo)
11785 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11787 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11788 return;
11790 /* Move all stmts that need moving. */
11791 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11792 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11794 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11796 /* Check to see if the statement is still required for vectorization or has been
11797 elided. */
11798 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11799 if (!stmt_info)
11800 continue;
11802 if (dump_enabled_p ())
11803 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11805 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11806 gsi_move_before (&stmt_gsi, &dest_gsi);
11807 gsi_prev (&dest_gsi);
11810 /* Update all the stmts with their new reaching VUSES. */
11811 tree vuse
11812 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11813 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11815 if (dump_enabled_p ())
11816 dump_printf_loc (MSG_NOTE, vect_location,
11817 "updating vuse to %T for load %G", vuse, p);
11818 gimple_set_vuse (p, vuse);
11819 update_stmt (p);
11823 /* Function vect_transform_loop.
11825 The analysis phase has determined that the loop is vectorizable.
11826 Vectorize the loop - create vectorized stmts to replace the scalar
11827 stmts in the loop, and update the loop exit condition.
11828 Returns the scalar epilogue loop, if any. */
11830 class loop *
11831 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11833 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11834 class loop *epilogue = NULL;
11835 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11836 int nbbs = loop->num_nodes;
11837 int i;
11838 tree niters_vector = NULL_TREE;
11839 tree step_vector = NULL_TREE;
11840 tree niters_vector_mult_vf = NULL_TREE;
11841 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11842 unsigned int lowest_vf = constant_lower_bound (vf);
11843 gimple *stmt;
11844 bool check_profitability = false;
11845 unsigned int th;
11846 bool flat = maybe_flat_loop_profile (loop);
11848 DUMP_VECT_SCOPE ("vec_transform_loop");
11850 loop_vinfo->shared->check_datarefs ();
11852 /* Use the more conservative vectorization threshold. If the number
11853 of iterations is constant, assume the cost check has been performed
11854 by our caller. If the threshold makes all loops that run at least
11855 the (estimated) vectorization factor number of times profitable,
11856 checking is pointless, too. */
11857 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11858 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11860 if (dump_enabled_p ())
11861 dump_printf_loc (MSG_NOTE, vect_location,
11862 "Profitability threshold is %d loop iterations.\n",
11863 th);
11864 check_profitability = true;
11867 /* Make sure there exists a single-predecessor exit bb. Do this before
11868 versioning. */
11869 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11870 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11872 split_loop_exit_edge (e, true);
11873 if (dump_enabled_p ())
11874 dump_printf (MSG_NOTE, "split exit edge\n");
11877 /* Version the loop first, if required, so the profitability check
11878 comes first. */
11880 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11882 class loop *sloop
11883 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11884 sloop->force_vectorize = false;
11885 check_profitability = false;
11888 /* Make sure there exists a single-predecessor exit bb also on the
11889 scalar loop copy. Do this after versioning but before peeling
11890 so the CFG structure is fine for both the scalar and the if-converted
11891 loop, and slpeel_duplicate_current_defs_from_edges sees matched
11892 loop-closed PHI nodes on the exit. */
11893 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11895 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11896 if (! single_pred_p (e->dest))
11898 split_loop_exit_edge (e, true);
11899 if (dump_enabled_p ())
11900 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11904 tree niters = vect_build_loop_niters (loop_vinfo);
11905 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11906 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11907 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11908 tree advance;
11909 drs_init_vec orig_drs_init;
11911 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11912 &step_vector, &niters_vector_mult_vf, th,
11913 check_profitability, niters_no_overflow,
11914 &advance);
11915 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11916 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11918 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11919 block after the loop exit. We need to scale all of that. */
11920 basic_block preheader
11921 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11922 preheader->count
11923 = preheader->count.apply_probability
11924 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11925 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11926 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11927 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11928 = preheader->count;
11931 if (niters_vector == NULL_TREE)
11933 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11934 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11935 && known_eq (lowest_vf, vf))
11937 niters_vector
11938 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11939 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11940 step_vector = build_one_cst (TREE_TYPE (niters));
11942 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11943 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11944 &step_vector, niters_no_overflow);
11945 else
11946 /* vect_do_peeling subtracted the number of peeled prologue
11947 iterations from LOOP_VINFO_NITERS. */
11948 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11949 &niters_vector, &step_vector,
11950 niters_no_overflow);
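/* Illustrative example (assuming no prologue peeling and no partial
   vectors): with a compile-time NITERS of 17 and a constant VF of 4,
   the first branch above sets niters_vector = 17 / 4 = 4 and
   step_vector = 1, leaving the remaining scalar iteration to the
   epilogue loop.  */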
11953 /* 1) Make sure the loop header has exactly two entries
11954 2) Make sure we have a preheader basic block. */
11956 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11958 split_edge (loop_preheader_edge (loop));
11960 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11961 /* This will deal with any possible peeling. */
11962 vect_prepare_for_masked_peels (loop_vinfo);
11964 /* Handle any code motion that we need to for early-break vectorization after
11965 we've done peeling but just before we start vectorizing. */
11966 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11967 move_early_exit_stmts (loop_vinfo);
11969 /* Schedule the SLP instances first, then handle loop vectorization
11970 below. */
11971 if (!loop_vinfo->slp_instances.is_empty ())
11973 DUMP_VECT_SCOPE ("scheduling SLP instances");
11974 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11977 /* FORNOW: the vectorizer supports only loops whose body consists
11978 of one basic block (header + empty latch). When the vectorizer
11979 supports more involved loop forms, the order in which the BBs are
11980 traversed needs to be reconsidered. */
11982 for (i = 0; i < nbbs; i++)
11984 basic_block bb = bbs[i];
11985 stmt_vec_info stmt_info;
11987 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11988 gsi_next (&si))
11990 gphi *phi = si.phi ();
11991 if (dump_enabled_p ())
11992 dump_printf_loc (MSG_NOTE, vect_location,
11993 "------>vectorizing phi: %G", (gimple *) phi);
11994 stmt_info = loop_vinfo->lookup_stmt (phi);
11995 if (!stmt_info)
11996 continue;
11998 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11999 vect_loop_kill_debug_uses (loop, stmt_info);
12001 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12002 && !STMT_VINFO_LIVE_P (stmt_info))
12003 continue;
12005 if (STMT_VINFO_VECTYPE (stmt_info)
12006 && (maybe_ne
12007 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12008 && dump_enabled_p ())
12009 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12011 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12012 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12013 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12014 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12015 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12016 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12017 && ! PURE_SLP_STMT (stmt_info))
12019 if (dump_enabled_p ())
12020 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12021 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12025 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12026 gsi_next (&si))
12028 gphi *phi = si.phi ();
12029 stmt_info = loop_vinfo->lookup_stmt (phi);
12030 if (!stmt_info)
12031 continue;
12033 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12034 && !STMT_VINFO_LIVE_P (stmt_info))
12035 continue;
12037 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12038 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12039 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12040 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12041 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12042 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12043 && ! PURE_SLP_STMT (stmt_info))
12044 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12047 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12048 !gsi_end_p (si);)
12050 stmt = gsi_stmt (si);
12051 /* During vectorization remove existing clobber stmts. */
12052 if (gimple_clobber_p (stmt))
12054 unlink_stmt_vdef (stmt);
12055 gsi_remove (&si, true);
12056 release_defs (stmt);
12058 else
12060 /* Ignore vector stmts created in the outer loop. */
12061 stmt_info = loop_vinfo->lookup_stmt (stmt);
12063 /* vector stmts created in the outer-loop during vectorization of
12064 stmts in an inner-loop may not have a stmt_info, and do not
12065 need to be vectorized. */
12066 stmt_vec_info seen_store = NULL;
12067 if (stmt_info)
12069 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12071 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12072 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12073 !gsi_end_p (subsi); gsi_next (&subsi))
12075 stmt_vec_info pat_stmt_info
12076 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12077 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12078 &si, &seen_store);
12080 stmt_vec_info pat_stmt_info
12081 = STMT_VINFO_RELATED_STMT (stmt_info);
12082 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12083 &si, &seen_store))
12084 maybe_set_vectorized_backedge_value (loop_vinfo,
12085 pat_stmt_info);
12087 else
12089 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12090 &seen_store))
12091 maybe_set_vectorized_backedge_value (loop_vinfo,
12092 stmt_info);
12095 gsi_next (&si);
12096 if (seen_store)
12098 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12099 /* Interleaving. The vectorization of the
12100 interleaving chain has been completed -
12101 free all the stores in the chain. */
12102 vect_remove_stores (loop_vinfo,
12103 DR_GROUP_FIRST_ELEMENT (seen_store));
12104 else
12105 /* Free the attached stmt_vec_info and remove the stmt. */
12106 loop_vinfo->remove_stmt (stmt_info);
12111 /* Stub out scalar statements that must not survive vectorization.
12112 Doing this here helps with grouped statements, or statements that
12113 are involved in patterns. */
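/* For instance (illustrative GIMPLE), a leftover scalar

     _7 = .MASK_LOAD (_3, 32B, mask_5);

   whose value must not survive vectorization is replaced below by the
   plain assignment _7 = 0, and a scalar conditional internal function
   call is replaced by its "else" argument.  */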
12114 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12115 !gsi_end_p (gsi); gsi_next (&gsi))
12117 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12118 if (!call || !gimple_call_internal_p (call))
12119 continue;
12120 internal_fn ifn = gimple_call_internal_fn (call);
12121 if (ifn == IFN_MASK_LOAD)
12123 tree lhs = gimple_get_lhs (call);
12124 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12126 tree zero = build_zero_cst (TREE_TYPE (lhs));
12127 gimple *new_stmt = gimple_build_assign (lhs, zero);
12128 gsi_replace (&gsi, new_stmt, true);
12131 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12133 tree lhs = gimple_get_lhs (call);
12134 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12136 tree else_arg
12137 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12138 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12139 gsi_replace (&gsi, new_stmt, true);
12143 } /* BBs in loop */
12145 /* The vectorization factor is always > 1, so if we use an IV increment of 1
12146 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12147 if (integer_onep (step_vector))
12148 niters_no_overflow = true;
12149 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12150 niters_vector, step_vector, niters_vector_mult_vf,
12151 !niters_no_overflow);
12153 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12155 /* True if the final iteration might not handle a full vector's
12156 worth of scalar iterations. */
12157 bool final_iter_may_be_partial
12158 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12159 /* The minimum number of iterations performed by the epilogue. This
12160 is 1 when peeling for gaps because we always need a final scalar
12161 iteration. */
12162 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12163 /* +1 to convert latch counts to loop iteration counts,
12164 -min_epilogue_iters to remove iterations that cannot be performed
12165 by the vector code. */
12166 int bias_for_lowest = 1 - min_epilogue_iters;
12167 int bias_for_assumed = bias_for_lowest;
12168 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12169 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12171 /* When the amount of peeling is known at compile time, the first
12172 iteration will have exactly alignment_npeels active elements.
12173 In the worst case it will have at least one. */
12174 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12175 bias_for_lowest += lowest_vf - min_first_active;
12176 bias_for_assumed += assumed_vf - min_first_active;
12178 /* In these calculations the "- 1" converts loop iteration counts
12179 back to latch counts. */
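/* A minimal worked example (illustrative numbers only): with
   lowest_vf == 4, no peeling for gaps and no partial vectors,
   bias_for_lowest == 1, so a scalar latch bound of 11 (at most 12
   iterations) becomes floor ((11 + 1) / 4) - 1 == 2 latch iterations
   of the vector loop.  */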
12180 if (loop->any_upper_bound)
12182 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12183 loop->nb_iterations_upper_bound
12184 = (final_iter_may_be_partial
12185 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12186 lowest_vf) - 1
12187 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12188 lowest_vf) - 1);
12189 if (main_vinfo
12190 /* Both peeling for alignment and peeling for gaps can end up
12191 with the scalar epilogue running for more than VF-1 iterations. */
12192 && !main_vinfo->peeling_for_alignment
12193 && !main_vinfo->peeling_for_gaps)
12195 unsigned int bound;
12196 poly_uint64 main_iters
12197 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12198 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12199 main_iters
12200 = upper_bound (main_iters,
12201 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12202 if (can_div_away_from_zero_p (main_iters,
12203 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12204 &bound))
12205 loop->nb_iterations_upper_bound
12206 = wi::umin ((bound_wide_int) (bound - 1),
12207 loop->nb_iterations_upper_bound);
12210 if (loop->any_likely_upper_bound)
12211 loop->nb_iterations_likely_upper_bound
12212 = (final_iter_may_be_partial
12213 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12214 + bias_for_lowest, lowest_vf) - 1
12215 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12216 + bias_for_lowest, lowest_vf) - 1);
12217 if (loop->any_estimate)
12218 loop->nb_iterations_estimate
12219 = (final_iter_may_be_partial
12220 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12221 assumed_vf) - 1
12222 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12223 assumed_vf) - 1);
12224 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12225 assumed_vf, flat);
12227 if (dump_enabled_p ())
12229 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12231 dump_printf_loc (MSG_NOTE, vect_location,
12232 "LOOP VECTORIZED\n");
12233 if (loop->inner)
12234 dump_printf_loc (MSG_NOTE, vect_location,
12235 "OUTER LOOP VECTORIZED\n");
12236 dump_printf (MSG_NOTE, "\n");
12238 else
12239 dump_printf_loc (MSG_NOTE, vect_location,
12240 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12241 GET_MODE_NAME (loop_vinfo->vector_mode));
12244 /* Loops vectorized with a variable factor won't benefit from
12245 unrolling/peeling. */
12246 if (!vf.is_constant ())
12248 loop->unroll = 1;
12249 if (dump_enabled_p ())
12250 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12251 " variable-length vectorization factor\n");
12253 /* Free SLP instances here because otherwise stmt reference counting
12254 won't work. */
12255 slp_instance instance;
12256 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12257 vect_free_slp_instance (instance);
12258 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12259 /* Clear the safelen field since its value is invalid after vectorization,
12260 as the vectorized loop can have loop-carried dependencies. */
12261 loop->safelen = 0;
12263 if (epilogue)
12265 update_epilogue_loop_vinfo (epilogue, advance);
12267 epilogue->simduid = loop->simduid;
12268 epilogue->force_vectorize = loop->force_vectorize;
12269 epilogue->dont_vectorize = false;
12272 return epilogue;
12275 /* The code below tries to perform a simple optimization - revert
12276 if-conversion for masked stores, i.e. if the mask of a store is zero,
12277 do not perform it, and, if possible, skip the producers of the stored values too.
12278 For example,
12279 for (i=0; i<n; i++)
12280 if (c[i])
12282 p1[i] += 1;
12283 p2[i] = p3[i] +2;
12285 this transformation will produce the following semi-hammock:
12287 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12289 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12290 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12291 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12292 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12293 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12294 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12298 void
12299 optimize_mask_stores (class loop *loop)
12301 basic_block *bbs = get_loop_body (loop);
12302 unsigned nbbs = loop->num_nodes;
12303 unsigned i;
12304 basic_block bb;
12305 class loop *bb_loop;
12306 gimple_stmt_iterator gsi;
12307 gimple *stmt;
12308 auto_vec<gimple *> worklist;
12309 auto_purge_vect_location sentinel;
12311 vect_location = find_loop_location (loop);
12312 /* Pick up all masked stores in loop if any. */
12313 for (i = 0; i < nbbs; i++)
12315 bb = bbs[i];
12316 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12317 gsi_next (&gsi))
12319 stmt = gsi_stmt (gsi);
12320 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12321 worklist.safe_push (stmt);
12325 free (bbs);
12326 if (worklist.is_empty ())
12327 return;
12329 /* Loop has masked stores. */
12330 while (!worklist.is_empty ())
12332 gimple *last, *last_store;
12333 edge e, efalse;
12334 tree mask;
12335 basic_block store_bb, join_bb;
12336 gimple_stmt_iterator gsi_to;
12337 tree vdef, new_vdef;
12338 gphi *phi;
12339 tree vectype;
12340 tree zero;
12342 last = worklist.pop ();
12343 mask = gimple_call_arg (last, 2);
12344 bb = gimple_bb (last);
12345 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
12346 the same loop as if_bb. That loop can be different from LOOP when a
12347 two-level loop nest is vectorized and the mask_store belongs to the
12348 inner one. */
12349 e = split_block (bb, last);
12350 bb_loop = bb->loop_father;
12351 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12352 join_bb = e->dest;
12353 store_bb = create_empty_bb (bb);
12354 add_bb_to_loop (store_bb, bb_loop);
12355 e->flags = EDGE_TRUE_VALUE;
12356 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12357 /* Put STORE_BB on the likely path. */
12358 efalse->probability = profile_probability::likely ();
12359 e->probability = efalse->probability.invert ();
12360 store_bb->count = efalse->count ();
12361 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12362 if (dom_info_available_p (CDI_DOMINATORS))
12363 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12364 if (dump_enabled_p ())
12365 dump_printf_loc (MSG_NOTE, vect_location,
12366 "Create new block %d to sink mask stores.",
12367 store_bb->index);
12368 /* Create vector comparison with boolean result. */
12369 vectype = TREE_TYPE (mask);
12370 zero = build_zero_cst (vectype);
12371 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12372 gsi = gsi_last_bb (bb);
12373 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12374 /* Create a new PHI node for the vdef of the last masked store:
12375 .MEM_2 = VDEF <.MEM_1>
12376 will be converted to
12377 .MEM_3 = VDEF <.MEM_1>
12378 and a new PHI node will be created in the join bb:
12379 .MEM_2 = PHI <.MEM_1, .MEM_3>
12381 vdef = gimple_vdef (last);
12382 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12383 gimple_set_vdef (last, new_vdef);
12384 phi = create_phi_node (vdef, join_bb);
12385 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12387 /* Put all masked stores with the same mask into STORE_BB if possible. */
12388 while (true)
12390 gimple_stmt_iterator gsi_from;
12391 gimple *stmt1 = NULL;
12393 /* Move masked store to STORE_BB. */
12394 last_store = last;
12395 gsi = gsi_for_stmt (last);
12396 gsi_from = gsi;
12397 /* Shift GSI to the previous stmt for further traversal. */
12398 gsi_prev (&gsi);
12399 gsi_to = gsi_start_bb (store_bb);
12400 gsi_move_before (&gsi_from, &gsi_to);
12401 /* Set GSI_TO to the start of the now non-empty block. */
12402 gsi_to = gsi_start_bb (store_bb);
12403 if (dump_enabled_p ())
12404 dump_printf_loc (MSG_NOTE, vect_location,
12405 "Move stmt to created bb\n%G", last);
12406 /* Move all stored value producers if possible. */
12407 while (!gsi_end_p (gsi))
12409 tree lhs;
12410 imm_use_iterator imm_iter;
12411 use_operand_p use_p;
12412 bool res;
12414 /* Skip debug statements. */
12415 if (is_gimple_debug (gsi_stmt (gsi)))
12417 gsi_prev (&gsi);
12418 continue;
12420 stmt1 = gsi_stmt (gsi);
12421 /* Do not consider statements writing to memory or having
12422 a volatile operand. */
12423 if (gimple_vdef (stmt1)
12424 || gimple_has_volatile_ops (stmt1))
12425 break;
12426 gsi_from = gsi;
12427 gsi_prev (&gsi);
12428 lhs = gimple_get_lhs (stmt1);
12429 if (!lhs)
12430 break;
12432 /* The LHS of a vectorized stmt must be an SSA_NAME. */
12433 if (TREE_CODE (lhs) != SSA_NAME)
12434 break;
12436 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12438 /* Remove dead scalar statement. */
12439 if (has_zero_uses (lhs))
12441 gsi_remove (&gsi_from, true);
12442 continue;
12446 /* Check that LHS does not have uses outside of STORE_BB. */
12447 res = true;
12448 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12450 gimple *use_stmt;
12451 use_stmt = USE_STMT (use_p);
12452 if (is_gimple_debug (use_stmt))
12453 continue;
12454 if (gimple_bb (use_stmt) != store_bb)
12456 res = false;
12457 break;
12460 if (!res)
12461 break;
12463 if (gimple_vuse (stmt1)
12464 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12465 break;
12467 /* Can move STMT1 to STORE_BB. */
12468 if (dump_enabled_p ())
12469 dump_printf_loc (MSG_NOTE, vect_location,
12470 "Move stmt to created bb\n%G", stmt1);
12471 gsi_move_before (&gsi_from, &gsi_to);
12472 /* Shift GSI_TO for further insertion. */
12473 gsi_prev (&gsi_to);
12475 /* Put other masked stores with the same mask into STORE_BB. */
12476 if (worklist.is_empty ()
12477 || gimple_call_arg (worklist.last (), 2) != mask
12478 || worklist.last () != stmt1)
12479 break;
12480 last = worklist.pop ();
12482 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12486 /* Decide whether it is possible to use a zero-based induction variable
12487 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12488 the value that the induction variable must be able to hold in order
12489 to ensure that the rgroups eventually have no active vector elements.
12490 Return -1 otherwise. */
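/* For example (illustrative numbers): with a constant VF of 8 (so
   max_vf == 8), no skipped iterations and no peeling for alignment,
   a maximum latch count of 100 gives
   iv_limit = (100 & -8) + 8 == 104.  */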
12492 widest_int
12493 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12495 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12496 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12497 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12499 /* Calculate the value that the induction variable must be able
12500 to hit in order to ensure that we end the loop with an all-false mask.
12501 This involves adding the maximum number of inactive trailing scalar
12502 iterations. */
12503 widest_int iv_limit = -1;
12504 if (max_loop_iterations (loop, &iv_limit))
12506 if (niters_skip)
12508 /* Add the maximum number of skipped iterations to the
12509 maximum iteration count. */
12510 if (TREE_CODE (niters_skip) == INTEGER_CST)
12511 iv_limit += wi::to_widest (niters_skip);
12512 else
12513 iv_limit += max_vf - 1;
12515 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12516 /* Make a conservatively-correct assumption. */
12517 iv_limit += max_vf - 1;
12519 /* IV_LIMIT is the maximum number of latch iterations, which is also
12520 the maximum in-range IV value. Round this value down to the previous
12521 vector alignment boundary and then add an extra full iteration. */
12522 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12523 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12525 return iv_limit;
12528 /* For the given rgroup_controls RGC, check whether an induction variable
12529 would ever hit a value that produces a set of all-false masks or zero
12530 lengths before wrapping around. Return true if it's possible to wrap
12531 around before hitting the desirable value, otherwise return false. */
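/* For example (illustrative numbers): with iv_limit == 300 and
   nitems == 1, the IV must be able to reach 300, which needs 9 bits,
   so an 8-bit compare type might wrap (return true) while a 16-bit
   one cannot (return false).  */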
12533 bool
12534 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12536 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12538 if (iv_limit == -1)
12539 return true;
12541 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12542 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12543 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12545 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12546 return true;
12548 return false;