gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "memmodel.h"
  36 #include "optabs.h"
  37 #include "diagnostic-core.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "cfganal.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-ssa-loop.h"
  48 #include "cfgloop.h"
  49 #include "tree-scalar-evolution.h"
  50 #include "tree-vectorizer.h"
  51 #include "gimple-fold.h"
  52 #include "cgraph.h"
  53 #include "tree-cfg.h"
  54 #include "tree-if-conv.h"
  55 #include "internal-fn.h"
  56 #include "tree-vector-builder.h"
  57 #include "vec-perm-indices.h"
  58 #include "tree-eh.h"
  59 #include "case-cfn-macros.h"
  60 #include "langhooks.h"
  61
  62 /* Loop Vectorization Pass.
  63
  64    This pass tries to vectorize loops.
  65
  66    For example, the vectorizer transforms the following simple loop:
  67
  68         short a[N]; short b[N]; short c[N]; int i;
  69
  70         for (i=0; i<N; i++){
  71           a[i] = b[i] + c[i];
  72         }
  73
  74    as if it was manually vectorized by rewriting the source code into:
  75
  76         typedef int __attribute__((mode(V8HI))) v8hi;
  77         short a[N];  short b[N]; short c[N];   int i;
  78         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  79         v8hi va, vb, vc;
  80
  81         for (i=0; i<N/8; i++){
  82           vb = pb[i];
  83           vc = pc[i];
  84           va = vb + vc;
  85           pa[i] = va;
  86         }
  87
  88         The main entry to this pass is vectorize_loops(), in which
  89    the vectorizer applies a set of analyses on a given set of loops,
  90    followed by the actual vectorization transformation for the loops that
  91    had successfully passed the analysis phase.
  92         Throughout this pass we make a distinction between two types of
  93    data: scalars (which are represented by SSA_NAMES), and memory references
  94    ("data-refs").  These two types of data require different handling both
  95    during analysis and transformation. The types of data-refs that the
  96    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  97    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  98    accesses are required to have a simple (consecutive) access pattern.
  99
 100    Analysis phase:
 101    ===============
 102         The driver for the analysis phase is vect_analyze_loop().
 103    It applies a set of analyses, some of which rely on the scalar evolution
 104    analyzer (scev) developed by Sebastian Pop.
 105
 106         During the analysis phase the vectorizer records some information
 107    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 108    loop, as well as general information about the loop as a whole, which is
 109    recorded in a "loop_vec_info" struct attached to each loop.
 110
 111    Transformation phase:
 112    =====================
 113         The loop transformation phase scans all the stmts in the loop, and
 114    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 115    the loop that needs to be vectorized.  It inserts the vector code sequence
 116    just before the scalar stmt S, and records a pointer to the vector code
 117    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 118    attached to S).  This pointer will be used for the vectorization of following
 119    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 120    otherwise, we rely on dead code elimination for removing it.
 121
 122         For example, say stmt S1 was vectorized into stmt VS1:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    S2:  a = b;
 127
 128    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 129    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 130    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 131    resulting sequence would be:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    VS2: va = vb;
 136    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 137
 138         Operands that are not SSA_NAMEs, are data-refs that appear in
 139    load/store operations (like 'x[i]' in S1), and are handled differently.
 140
 141    Target modeling:
 142    =================
 143         Currently the only target specific information that is used is the
 144    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 145    Targets that can support different sizes of vectors, for now will need
 146    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 147    flexibility will be added in the future.
 148
 149         Since we only vectorize operations which vector form can be
 150    expressed using existing tree codes, to verify that an operation is
 151    supported, the vectorizer checks the relevant optab at the relevant
 152    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 153    the value found is CODE_FOR_nothing, then there's no target support, and
 154    we can't vectorize the stmt.
 155
 156    For additional information on this project see:
 157    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 158 */
 159
 160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 161                                                 unsigned *);
 162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 163                                                bool *, bool *, bool);
 164
 165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 166    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 167    may already be set for general statements (not just data refs).  */
 168
 169 static opt_result
 170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 171                               bool vectype_maybe_set_p,
 172                               poly_uint64 *vf)
 173 {
 174   gimple *stmt = stmt_info->stmt;
 175
 176   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 177        && !STMT_VINFO_LIVE_P (stmt_info))
 178       || gimple_clobber_p (stmt))
 179     {
 180       if (dump_enabled_p ())
 181         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 182       return opt_result::success ();
 183     }
 184
 185   tree stmt_vectype, nunits_vectype;
 186   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 187                                                    &stmt_vectype,
 188                                                    &nunits_vectype);
 189   if (!res)
 190     return res;
 191
 192   if (stmt_vectype)
 193     {
 194       if (STMT_VINFO_VECTYPE (stmt_info))
 195         /* The only case when a vectype had been already set is for stmts
 196            that contain a data ref, or for "pattern-stmts" (stmts generated
 197            by the vectorizer to represent/replace a certain idiom).  */
 198         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 199                      || vectype_maybe_set_p)
 200                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 201       else
 202         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 203     }
 204
 205   if (nunits_vectype)
 206     vect_update_max_nunits (vf, nunits_vectype);
 207
 208   return opt_result::success ();
 209 }
 210
 211 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 212    types of STMT_INFO and all attached pattern statements and update
 213    the vectorization factor VF accordingly.  Return true on success
 214    or false if something prevented vectorization.  */
 215
 216 static opt_result
 217 vect_determine_vf_for_stmt (vec_info *vinfo,
 218                             stmt_vec_info stmt_info, poly_uint64 *vf)
 219 {
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 222                      stmt_info->stmt);
 223   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 224   if (!res)
 225     return res;
 226
 227   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 228       && STMT_VINFO_RELATED_STMT (stmt_info))
 229     {
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 232
 233       /* If a pattern statement has def stmts, analyze them too.  */
 234       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 235            !gsi_end_p (si); gsi_next (&si))
 236         {
 237           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 238           if (dump_enabled_p ())
 239             dump_printf_loc (MSG_NOTE, vect_location,
 240                              "==> examining pattern def stmt: %G",
 241                              def_stmt_info->stmt);
 242           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 243           if (!res)
 244             return res;
 245         }
 246
 247       if (dump_enabled_p ())
 248         dump_printf_loc (MSG_NOTE, vect_location,
 249                          "==> examining pattern statement: %G",
 250                          stmt_info->stmt);
 251       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 252       if (!res)
 253         return res;
 254     }
 255
 256   return opt_result::success ();
 257 }
 258
 259 /* Function vect_determine_vectorization_factor
 260
 261    Determine the vectorization factor (VF).  VF is the number of data elements
 262    that are operated upon in parallel in a single iteration of the vectorized
 263    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 264    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 265    elements can fit in a single vector register.
 266
 267    We currently support vectorization of loops in which all types operated upon
 268    are of the same size.  Therefore this function currently sets VF according to
 269    the size of the types operated upon, and fails if there are multiple sizes
 270    in the loop.
 271
 272    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 273    original loop:
 274         for (i=0; i<N; i++){
 275           a[i] = b[i] + c[i];
 276         }
 277
 278    vectorized loop:
 279         for (i=0; i<N; i+=VF){
 280           a[i:VF] = b[i:VF] + c[i:VF];
 281         }
 282 */
 283
 284 static opt_result
 285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 286 {
 287   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 288   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 289   unsigned nbbs = loop->num_nodes;
 290   poly_uint64 vectorization_factor = 1;
 291   tree scalar_type = NULL_TREE;
 292   gphi *phi;
 293   tree vectype;
 294   stmt_vec_info stmt_info;
 295   unsigned i;
 296
 297   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 298
 299   for (i = 0; i < nbbs; i++)
 300     {
 301       basic_block bb = bbs[i];
 302
 303       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 304            gsi_next (&si))
 305         {
 306           phi = si.phi ();
 307           stmt_info = loop_vinfo->lookup_stmt (phi);
 308           if (dump_enabled_p ())
 309             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 310                              (gimple *) phi);
 311
 312           gcc_assert (stmt_info);
 313
 314           if (STMT_VINFO_RELEVANT_P (stmt_info)
 315               || STMT_VINFO_LIVE_P (stmt_info))
 316             {
 317               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 318               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 319
 320               if (dump_enabled_p ())
 321                 dump_printf_loc (MSG_NOTE, vect_location,
 322                                  "get vectype for scalar type:  %T\n",
 323                                  scalar_type);
 324
 325               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 326               if (!vectype)
 327                 return opt_result::failure_at (phi,
 328                                                "not vectorized: unsupported "
 329                                                "data-type %T\n",
 330                                                scalar_type);
 331               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 332
 333               if (dump_enabled_p ())
 334                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 335                                  vectype);
 336
 337               if (dump_enabled_p ())
 338                 {
 339                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 340                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 341                   dump_printf (MSG_NOTE, "\n");
 342                 }
 343
 344               vect_update_max_nunits (&vectorization_factor, vectype);
 345             }
 346         }
 347
 348       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 349            gsi_next (&si))
 350         {
 351           if (is_gimple_debug (gsi_stmt (si)))
 352             continue;
 353           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 354           opt_result res
 355             = vect_determine_vf_for_stmt (loop_vinfo,
 356                                           stmt_info, &vectorization_factor);
 357           if (!res)
 358             return res;
 359         }
 360     }
 361
 362   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 363   if (dump_enabled_p ())
 364     {
 365       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 366       dump_dec (MSG_NOTE, vectorization_factor);
 367       dump_printf (MSG_NOTE, "\n");
 368     }
 369
 370   if (known_le (vectorization_factor, 1U))
 371     return opt_result::failure_at (vect_location,
 372                                    "not vectorized: unsupported data-type\n");
 373   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 374   return opt_result::success ();
 375 }
 376
 377
 378 /* Function vect_is_simple_iv_evolution.
 379
 380    FORNOW: A simple evolution of an induction variables in the loop is
 381    considered a polynomial evolution.  */
 382
 383 static bool
 384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 385                              tree * step)
 386 {
 387   tree init_expr;
 388   tree step_expr;
 389   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 390   basic_block bb;
 391
 392   /* When there is no evolution in this loop, the evolution function
 393      is not "simple".  */
 394   if (evolution_part == NULL_TREE)
 395     return false;
 396
 397   /* When the evolution is a polynomial of degree >= 2
 398      the evolution function is not "simple".  */
 399   if (tree_is_chrec (evolution_part))
 400     return false;
 401
 402   step_expr = evolution_part;
 403   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 404
 405   if (dump_enabled_p ())
 406     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 407                      step_expr, init_expr);
 408
 409   *init = init_expr;
 410   *step = step_expr;
 411
 412   if (TREE_CODE (step_expr) != INTEGER_CST
 413       && (TREE_CODE (step_expr) != SSA_NAME
 414           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 415               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 416           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 417               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 418                   || !flag_associative_math)))
 419       && (TREE_CODE (step_expr) != REAL_CST
 420           || !flag_associative_math))
 421     {
 422       if (dump_enabled_p ())
 423         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 424                          "step unknown.\n");
 425       return false;
 426     }
 427
 428   return true;
 429 }
 430
 431 /* Function vect_is_nonlinear_iv_evolution
 432
 433    Only support nonlinear induction for integer type
 434    1. neg
 435    2. mul by constant
 436    3. lshift/rshift by constant.
 437
 438    For neg induction, return a fake step as integer -1.  */
 439 static bool
 440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 441                                 gphi* loop_phi_node, tree *init, tree *step)
 442 {
 443   tree init_expr, ev_expr, result, op1, op2;
 444   gimple* def;
 445
 446   if (gimple_phi_num_args (loop_phi_node) != 2)
 447     return false;
 448
 449   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 450   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 451
 452   /* Support nonlinear induction only for integer type.  */
 453   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 454     return false;
 455
 456   *init = init_expr;
 457   result = PHI_RESULT (loop_phi_node);
 458
 459   if (TREE_CODE (ev_expr) != SSA_NAME
 460       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 461       || !is_gimple_assign (def))
 462     return false;
 463
 464   enum tree_code t_code = gimple_assign_rhs_code (def);
 465   switch (t_code)
 466     {
 467     case NEGATE_EXPR:
 468       if (gimple_assign_rhs1 (def) != result)
 469         return false;
 470       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 471       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 472       break;
 473
 474     case RSHIFT_EXPR:
 475     case LSHIFT_EXPR:
 476     case MULT_EXPR:
 477       op1 = gimple_assign_rhs1 (def);
 478       op2 = gimple_assign_rhs2 (def);
 479       if (TREE_CODE (op2) != INTEGER_CST
 480           || op1 != result)
 481         return false;
 482       *step = op2;
 483       if (t_code == LSHIFT_EXPR)
 484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 485       else if (t_code == RSHIFT_EXPR)
 486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 487       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 488       else
 489         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 490       break;
 491
 492     default:
 493       return false;
 494     }
 495
 496   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 497   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 498
 499   return true;
 500 }
 501
 502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 503    what we are assuming is a double reduction.  For example, given
 504    a structure like this:
 505
 506       outer1:
 507         x_1 = PHI <x_4(outer2), ...>;
 508         ...
 509
 510       inner:
 511         x_2 = PHI <x_1(outer1), ...>;
 512         ...
 513         x_3 = ...;
 514         ...
 515
 516       outer2:
 517         x_4 = PHI <x_3(inner)>;
 518         ...
 519
 520    outer loop analysis would treat x_1 as a double reduction phi and
 521    this function would then return true for x_2.  */
 522
 523 static bool
 524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 525 {
 526   use_operand_p use_p;
 527   ssa_op_iter op_iter;
 528   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 529     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 530       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 531         return true;
 532   return false;
 533 }
 534
 535 /* Returns true if Phi is a first-order recurrence. A first-order
 536    recurrence is a non-reduction recurrence relation in which the value of
 537    the recurrence in the current loop iteration equals a value defined in
 538    the previous iteration.  */
 539
 540 static bool
 541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 542                                    gphi *phi)
 543 {
 544   /* A nested cycle isn't vectorizable as first order recurrence.  */
 545   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 546     return false;
 547
 548   /* Ensure the loop latch definition is from within the loop.  */
 549   edge latch = loop_latch_edge (loop);
 550   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 551   if (TREE_CODE (ldef) != SSA_NAME
 552       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 553       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 554       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 555     return false;
 556
 557   tree def = gimple_phi_result (phi);
 558
 559   /* Ensure every use_stmt of the phi node is dominated by the latch
 560      definition.  */
 561   imm_use_iterator imm_iter;
 562   use_operand_p use_p;
 563   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 564     if (!is_gimple_debug (USE_STMT (use_p))
 565         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 566             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 567                                             USE_STMT (use_p))))
 568       return false;
 569
 570   /* First-order recurrence autovectorization needs shuffle vector.  */
 571   tree scalar_type = TREE_TYPE (def);
 572   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 573   if (!vectype)
 574     return false;
 575
 576   return true;
 577 }
 578
 579 /* Function vect_analyze_scalar_cycles_1.
 580
 581    Examine the cross iteration def-use cycles of scalar variables
 582    in LOOP.  LOOP_VINFO represents the loop that is now being
 583    considered for vectorization (can be LOOP, or an outer-loop
 584    enclosing LOOP).  SLP indicates there will be some subsequent
 585    slp analyses or not.  */
 586
 587 static void
 588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 589                               bool slp)
 590 {
 591   basic_block bb = loop->header;
 592   tree init, step;
 593   auto_vec<stmt_vec_info, 64> worklist;
 594   gphi_iterator gsi;
 595   bool double_reduc, reduc_chain;
 596
 597   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 598
 599   /* First - identify all inductions.  Reduction detection assumes that all the
 600      inductions have been identified, therefore, this order must not be
 601      changed.  */
 602   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 603     {
 604       gphi *phi = gsi.phi ();
 605       tree access_fn = NULL;
 606       tree def = PHI_RESULT (phi);
 607       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 611                          (gimple *) phi);
 612
 613       /* Skip virtual phi's.  The data dependences that are associated with
 614          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 615       if (virtual_operand_p (def))
 616         continue;
 617
 618       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 619
 620       /* Analyze the evolution function.  */
 621       access_fn = analyze_scalar_evolution (loop, def);
 622       if (access_fn)
 623         {
 624           STRIP_NOPS (access_fn);
 625           if (dump_enabled_p ())
 626             dump_printf_loc (MSG_NOTE, vect_location,
 627                              "Access function of PHI: %T\n", access_fn);
 628           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 629             = initial_condition_in_loop_num (access_fn, loop->num);
 630           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 631             = evolution_part_in_loop_num (access_fn, loop->num);
 632         }
 633
 634       if ((!access_fn
 635            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 636            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 637                                             &init, &step)
 638            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 639                && TREE_CODE (step) != INTEGER_CST))
 640           /* Only handle nonlinear iv for same loop.  */
 641           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 642               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 643                                                   phi, &init, &step)))
 644         {
 645           worklist.safe_push (stmt_vinfo);
 646           continue;
 647         }
 648
 649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 650                   != NULL_TREE);
 651       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 652
 653       if (dump_enabled_p ())
 654         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 655       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 656     }
 657
 658
 659   /* Second - identify all reductions and nested cycles.  */
 660   while (worklist.length () > 0)
 661     {
 662       stmt_vec_info stmt_vinfo = worklist.pop ();
 663       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 664       tree def = PHI_RESULT (phi);
 665
 666       if (dump_enabled_p ())
 667         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 668                          (gimple *) phi);
 669
 670       gcc_assert (!virtual_operand_p (def)
 671                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 672
 673       stmt_vec_info reduc_stmt_info
 674         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 675                                     &reduc_chain, slp);
 676       if (reduc_stmt_info)
 677         {
 678           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 679           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 680           if (double_reduc)
 681             {
 682               if (dump_enabled_p ())
 683                 dump_printf_loc (MSG_NOTE, vect_location,
 684                                  "Detected double reduction.\n");
 685
 686               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 687               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 688             }
 689           else
 690             {
 691               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 692                 {
 693                   if (dump_enabled_p ())
 694                     dump_printf_loc (MSG_NOTE, vect_location,
 695                                      "Detected vectorizable nested cycle.\n");
 696
 697                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 698                 }
 699               else
 700                 {
 701                   if (dump_enabled_p ())
 702                     dump_printf_loc (MSG_NOTE, vect_location,
 703                                      "Detected reduction.\n");
 704
 705                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 706                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 707                   /* Store the reduction cycles for possible vectorization in
 708                      loop-aware SLP if it was not detected as reduction
 709                      chain.  */
 710                   if (! reduc_chain)
 711                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 712                       (reduc_stmt_info);
 713                 }
 714             }
 715         }
 716       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 717         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 718       else
 719         if (dump_enabled_p ())
 720           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                            "Unknown def-use cycle pattern.\n");
 722     }
 723 }
 724
 725
 726 /* Function vect_analyze_scalar_cycles.
 727
 728    Examine the cross iteration def-use cycles of scalar variables, by
 729    analyzing the loop-header PHIs of scalar variables.  Classify each
 730    cycle as one of the following: invariant, induction, reduction, unknown.
 731    We do that for the loop represented by LOOP_VINFO, and also to its
 732    inner-loop, if exists.
 733    Examples for scalar cycles:
 734
 735    Example1: reduction:
 736
 737               loop1:
 738               for (i=0; i<N; i++)
 739                  sum += a[i];
 740
 741    Example2: induction:
 742
 743               loop2:
 744               for (i=0; i<N; i++)
 745                  a[i] = i;  */
 746
 747 static void
 748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 749 {
 750   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 751
 752   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 753
 754   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 755      Reductions in such inner-loop therefore have different properties than
 756      the reductions in the nest that gets vectorized:
 757      1. When vectorized, they are executed in the same order as in the original
 758         scalar loop, so we can't change the order of computation when
 759         vectorizing them.
 760      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 761         current checks are too strict.  */
 762
 763   if (loop->inner)
 764     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 765 }
 766
 767 /* Transfer group and reduction information from STMT_INFO to its
 768    pattern stmt.  */
 769
 770 static void
 771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 772 {
 773   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 774   stmt_vec_info stmtp;
 775   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 776               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 777   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 778   do
 779     {
 780       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 781       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 782                            == STMT_VINFO_DEF_TYPE (stmt_info));
 783       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 784       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 785       if (stmt_info)
 786         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 787           = STMT_VINFO_RELATED_STMT (stmt_info);
 788     }
 789   while (stmt_info);
 790 }
 791
 792 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 793
 794 static void
 795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 796 {
 797   stmt_vec_info first;
 798   unsigned i;
 799
 800   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 801     {
 802       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 803       while (next)
 804         {
 805           if ((STMT_VINFO_IN_PATTERN_P (next)
 806                != STMT_VINFO_IN_PATTERN_P (first))
 807               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 808             break;
 809           next = REDUC_GROUP_NEXT_ELEMENT (next);
 810         }
 811       /* If all reduction chain members are well-formed patterns adjust
 812          the group to group the pattern stmts instead.  */
 813       if (! next
 814           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 815         {
 816           if (STMT_VINFO_IN_PATTERN_P (first))
 817             {
 818               vect_fixup_reduc_chain (first);
 819               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 820                 = STMT_VINFO_RELATED_STMT (first);
 821             }
 822         }
 823       /* If not all stmt in the chain are patterns or if we failed
 824          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 825          it as regular reduction instead.  */
 826       else
 827         {
 828           stmt_vec_info vinfo = first;
 829           stmt_vec_info last = NULL;
 830           while (vinfo)
 831             {
 832               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 833               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 834               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 835               last = vinfo;
 836               vinfo = next;
 837             }
 838           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 839             = vect_internal_def;
 840           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 841           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 842           --i;
 843         }
 844     }
 845 }
 846
 847 /* Function vect_get_loop_niters.
 848
 849    Determine how many iterations the loop is executed and place it
 850    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 851    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 852    niter information holds in ASSUMPTIONS.
 853
 854    Return the loop exit conditions.  */
 855
 856
 857 static vec<gcond *>
 858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
 859                       tree *number_of_iterations, tree *number_of_iterationsm1)
 860 {
 861   auto_vec<edge> exits = get_loop_exit_edges (loop);
 862   vec<gcond *> conds;
 863   conds.create (exits.length ());
 864   class tree_niter_desc niter_desc;
 865   tree niter_assumptions, niter, may_be_zero;
 866
 867   *assumptions = boolean_true_node;
 868   *number_of_iterationsm1 = chrec_dont_know;
 869   *number_of_iterations = chrec_dont_know;
 870
 871   DUMP_VECT_SCOPE ("get_loop_niters");
 872
 873   if (exits.is_empty ())
 874     return conds;
 875
 876   if (dump_enabled_p ())
 877     dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
 878                      exits.length ());
 879
 880   edge exit;
 881   unsigned int i;
 882   FOR_EACH_VEC_ELT (exits, i, exit)
 883     {
 884       gcond *cond = get_loop_exit_condition (exit);
 885       if (cond)
 886         conds.safe_push (cond);
 887
 888       if (dump_enabled_p ())
 889         dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
 890
 891       if (exit != main_exit)
 892         continue;
 893
 894       may_be_zero = NULL_TREE;
 895       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 896           || chrec_contains_undetermined (niter_desc.niter))
 897         continue;
 898
 899       niter_assumptions = niter_desc.assumptions;
 900       may_be_zero = niter_desc.may_be_zero;
 901       niter = niter_desc.niter;
 902
 903       if (may_be_zero && integer_zerop (may_be_zero))
 904         may_be_zero = NULL_TREE;
 905
 906       if (may_be_zero)
 907         {
 908           if (COMPARISON_CLASS_P (may_be_zero))
 909             {
 910               /* Try to combine may_be_zero with assumptions, this can simplify
 911                  computation of niter expression.  */
 912               if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 913                 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 914                                                  niter_assumptions,
 915                                                  fold_build1 (TRUTH_NOT_EXPR,
 916                                                               boolean_type_node,
 917                                                               may_be_zero));
 918               else
 919                 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 920                                      build_int_cst (TREE_TYPE (niter), 0),
 921                                      rewrite_to_non_trapping_overflow (niter));
 922
 923               may_be_zero = NULL_TREE;
 924             }
 925           else if (integer_nonzerop (may_be_zero))
 926             {
 927               *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 928               *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 929               continue;
 930             }
 931           else
 932             continue;
 933        }
 934
 935       /* Loop assumptions are based off the normal exit.  */
 936       *assumptions = niter_assumptions;
 937       *number_of_iterationsm1 = niter;
 938
 939       /* We want the number of loop header executions which is the number
 940          of latch executions plus one.
 941          ???  For UINT_MAX latch executions this number overflows to zero
 942          for loops like do { n++; } while (n != 0);  */
 943       if (niter && !chrec_contains_undetermined (niter))
 944         {
 945           niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
 946                                unshare_expr (niter),
 947                                build_int_cst (TREE_TYPE (niter), 1));
 948           if (TREE_CODE (niter) == INTEGER_CST
 949               && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
 950             {
 951               /* If we manage to fold niter + 1 into INTEGER_CST even when
 952                  niter is some complex expression, ensure back
 953                  *number_of_iterationsm1 is an INTEGER_CST as well.  See
 954                  PR113210.  */
 955               *number_of_iterationsm1
 956                 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
 957                                build_minus_one_cst (TREE_TYPE (niter)));
 958             }
 959         }
 960       *number_of_iterations = niter;
 961     }
 962
 963   if (dump_enabled_p ())
 964     dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
 965
 966   return conds;
 967 }
 968
 969 /*  Determine the main loop exit for the vectorizer.  */
 970
 971 edge
 972 vec_init_loop_exit_info (class loop *loop)
 973 {
 974   /* Before we begin we must first determine which exit is the main one and
 975      which are auxilary exits.  */
 976   auto_vec<edge> exits = get_loop_exit_edges (loop);
 977   if (exits.length () == 1)
 978     return exits[0];
 979
 980   /* If we have multiple exits we only support counting IV at the moment.  Analyze
 981      all exits and return one */
 982   class tree_niter_desc niter_desc;
 983   edge candidate = NULL;
 984   for (edge exit : exits)
 985     {
 986       if (!get_loop_exit_condition (exit))
 987         continue;
 988
 989       if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 990           && !chrec_contains_undetermined (niter_desc.niter))
 991         {
 992           if (!niter_desc.may_be_zero || !candidate)
 993             candidate = exit;
 994         }
 995     }
 996
 997   return candidate;
 998 }
 999
1000 /* Function bb_in_loop_p
1001
1002    Used as predicate for dfs order traversal of the loop bbs.  */
1003
1004 static bool
1005 bb_in_loop_p (const_basic_block bb, const void *data)
1006 {
1007   const class loop *const loop = (const class loop *)data;
1008   if (flow_bb_inside_loop_p (loop, bb))
1009     return true;
1010   return false;
1011 }
1012
1013
1014 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1015    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1016
1017 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1018   : vec_info (vec_info::loop, shared),
1019     loop (loop_in),
1020     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1021     num_itersm1 (NULL_TREE),
1022     num_iters (NULL_TREE),
1023     num_iters_unchanged (NULL_TREE),
1024     num_iters_assumptions (NULL_TREE),
1025     vector_costs (nullptr),
1026     scalar_costs (nullptr),
1027     th (0),
1028     versioning_threshold (0),
1029     vectorization_factor (0),
1030     main_loop_edge (nullptr),
1031     skip_main_loop_edge (nullptr),
1032     skip_this_loop_edge (nullptr),
1033     reusable_accumulators (),
1034     suggested_unroll_factor (1),
1035     max_vectorization_factor (0),
1036     mask_skip_niters (NULL_TREE),
1037     rgroup_compare_type (NULL_TREE),
1038     simd_if_cond (NULL_TREE),
1039     partial_vector_style (vect_partial_vectors_none),
1040     unaligned_dr (NULL),
1041     peeling_for_alignment (0),
1042     ptr_mask (0),
1043     ivexpr_map (NULL),
1044     scan_map (NULL),
1045     slp_unrolling_factor (1),
1046     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1047     vectorizable (false),
1048     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1049     using_partial_vectors_p (false),
1050     using_decrementing_iv_p (false),
1051     using_select_vl_p (false),
1052     epil_using_partial_vectors_p (false),
1053     partial_load_store_bias (0),
1054     peeling_for_gaps (false),
1055     peeling_for_niter (false),
1056     early_breaks (false),
1057     no_data_dependencies (false),
1058     has_mask_store (false),
1059     scalar_loop_scaling (profile_probability::uninitialized ()),
1060     scalar_loop (NULL),
1061     orig_loop_info (NULL),
1062     vec_loop_iv_exit (NULL),
1063     vec_epilogue_loop_iv_exit (NULL),
1064     scalar_loop_iv_exit (NULL)
1065 {
1066   /* CHECKME: We want to visit all BBs before their successors (except for
1067      latch blocks, for which this assertion wouldn't hold).  In the simple
1068      case of the loop forms we allow, a dfs order of the BBs would the same
1069      as reversed postorder traversal, so we are safe.  */
1070
1071   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1072                                           bbs, loop->num_nodes, loop);
1073   gcc_assert (nbbs == loop->num_nodes);
1074
1075   for (unsigned int i = 0; i < nbbs; i++)
1076     {
1077       basic_block bb = bbs[i];
1078       gimple_stmt_iterator si;
1079
1080       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1081         {
1082           gimple *phi = gsi_stmt (si);
1083           gimple_set_uid (phi, 0);
1084           add_stmt (phi);
1085         }
1086
1087       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1088         {
1089           gimple *stmt = gsi_stmt (si);
1090           gimple_set_uid (stmt, 0);
1091           if (is_gimple_debug (stmt))
1092             continue;
1093           add_stmt (stmt);
1094           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1095              third argument is the #pragma omp simd if (x) condition, when 0,
1096              loop shouldn't be vectorized, when non-zero constant, it should
1097              be vectorized normally, otherwise versioned with vectorized loop
1098              done if the condition is non-zero at runtime.  */
1099           if (loop_in->simduid
1100               && is_gimple_call (stmt)
1101               && gimple_call_internal_p (stmt)
1102               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1103               && gimple_call_num_args (stmt) >= 3
1104               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1105               && (loop_in->simduid
1106                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1107             {
1108               tree arg = gimple_call_arg (stmt, 2);
1109               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1110                 simd_if_cond = arg;
1111               else
1112                 gcc_assert (integer_nonzerop (arg));
1113             }
1114         }
1115     }
1116
1117   epilogue_vinfos.create (6);
1118 }
1119
1120 /* Free all levels of rgroup CONTROLS.  */
1121
1122 void
1123 release_vec_loop_controls (vec<rgroup_controls> *controls)
1124 {
1125   rgroup_controls *rgc;
1126   unsigned int i;
1127   FOR_EACH_VEC_ELT (*controls, i, rgc)
1128     rgc->controls.release ();
1129   controls->release ();
1130 }
1131
1132 /* Free all memory used by the _loop_vec_info, as well as all the
1133    stmt_vec_info structs of all the stmts in the loop.  */
1134
1135 _loop_vec_info::~_loop_vec_info ()
1136 {
1137   free (bbs);
1138
1139   release_vec_loop_controls (&masks.rgc_vec);
1140   release_vec_loop_controls (&lens);
1141   delete ivexpr_map;
1142   delete scan_map;
1143   epilogue_vinfos.release ();
1144   delete scalar_costs;
1145   delete vector_costs;
1146
1147   /* When we release an epiloge vinfo that we do not intend to use
1148      avoid clearing AUX of the main loop which should continue to
1149      point to the main loop vinfo since otherwise we'll leak that.  */
1150   if (loop->aux == this)
1151     loop->aux = NULL;
1152 }
1153
1154 /* Return an invariant or register for EXPR and emit necessary
1155    computations in the LOOP_VINFO loop preheader.  */
1156
1157 tree
1158 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1159 {
1160   if (is_gimple_reg (expr)
1161       || is_gimple_min_invariant (expr))
1162     return expr;
1163
1164   if (! loop_vinfo->ivexpr_map)
1165     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1166   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1167   if (! cached)
1168     {
1169       gimple_seq stmts = NULL;
1170       cached = force_gimple_operand (unshare_expr (expr),
1171                                      &stmts, true, NULL_TREE);
1172       if (stmts)
1173         {
1174           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1175           gsi_insert_seq_on_edge_immediate (e, stmts);
1176         }
1177     }
1178   return cached;
1179 }
1180
1181 /* Return true if we can use CMP_TYPE as the comparison type to produce
1182    all masks required to mask LOOP_VINFO.  */
1183
1184 static bool
1185 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1186 {
1187   rgroup_controls *rgm;
1188   unsigned int i;
1189   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1190     if (rgm->type != NULL_TREE
1191         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1192                                             cmp_type, rgm->type,
1193                                             OPTIMIZE_FOR_SPEED))
1194       return false;
1195   return true;
1196 }
1197
1198 /* Calculate the maximum number of scalars per iteration for every
1199    rgroup in LOOP_VINFO.  */
1200
1201 static unsigned int
1202 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1203 {
1204   unsigned int res = 1;
1205   unsigned int i;
1206   rgroup_controls *rgm;
1207   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1208     res = MAX (res, rgm->max_nscalars_per_iter);
1209   return res;
1210 }
1211
1212 /* Calculate the minimum precision necessary to represent:
1213
1214       MAX_NITERS * FACTOR
1215
1216    as an unsigned integer, where MAX_NITERS is the maximum number of
1217    loop header iterations for the original scalar form of LOOP_VINFO.  */
1218
1219 static unsigned
1220 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1221 {
1222   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1223
1224   /* Get the maximum number of iterations that is representable
1225      in the counter type.  */
1226   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1227   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1228
1229   /* Get a more refined estimate for the number of iterations.  */
1230   widest_int max_back_edges;
1231   if (max_loop_iterations (loop, &max_back_edges))
1232     max_ni = wi::smin (max_ni, max_back_edges + 1);
1233
1234   /* Work out how many bits we need to represent the limit.  */
1235   return wi::min_precision (max_ni * factor, UNSIGNED);
1236 }
1237
1238 /* True if the loop needs peeling or partial vectors when vectorized.  */
1239
1240 static bool
1241 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1242 {
1243   unsigned HOST_WIDE_INT const_vf;
1244   HOST_WIDE_INT max_niter
1245     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1246
1247   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1248   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1249     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1250                                           (loop_vinfo));
1251
1252   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1253       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1254     {
1255       /* Work out the (constant) number of iterations that need to be
1256          peeled for reasons other than niters.  */
1257       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1258       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1259         peel_niter += 1;
1260       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1261                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1262         return true;
1263     }
1264   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1265       /* ??? When peeling for gaps but not alignment, we could
1266          try to check whether the (variable) niters is known to be
1267          VF * N + 1.  That's something of a niche case though.  */
1268       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1269       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1270       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1271            < (unsigned) exact_log2 (const_vf))
1272           /* In case of versioning, check if the maximum number of
1273              iterations is greater than th.  If they are identical,
1274              the epilogue is unnecessary.  */
1275           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1276               || ((unsigned HOST_WIDE_INT) max_niter
1277                   /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1278                      but that's only computed later based on our result.
1279                      The following is the most conservative approximation.  */
1280                   > (std::max ((unsigned HOST_WIDE_INT) th,
1281                                const_vf) / const_vf) * const_vf))))
1282     return true;
1283
1284   return false;
1285 }
1286
1287 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1288    whether we can actually generate the masks required.  Return true if so,
1289    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1290
1291 static bool
1292 vect_verify_full_masking (loop_vec_info loop_vinfo)
1293 {
1294   unsigned int min_ni_width;
1295
1296   /* Use a normal loop if there are no statements that need masking.
1297      This only happens in rare degenerate cases: it means that the loop
1298      has no loads, no stores, and no live-out values.  */
1299   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1300     return false;
1301
1302   /* Produce the rgroup controls.  */
1303   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1304     {
1305       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1306       tree vectype = mask.first;
1307       unsigned nvectors = mask.second;
1308
1309       if (masks->rgc_vec.length () < nvectors)
1310         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1311       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1312       /* The number of scalars per iteration and the number of vectors are
1313          both compile-time constants.  */
1314       unsigned int nscalars_per_iter
1315           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1316                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1317
1318       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1319         {
1320           rgm->max_nscalars_per_iter = nscalars_per_iter;
1321           rgm->type = truth_type_for (vectype);
1322           rgm->factor = 1;
1323         }
1324     }
1325
1326   unsigned int max_nscalars_per_iter
1327     = vect_get_max_nscalars_per_iter (loop_vinfo);
1328
1329   /* Work out how many bits we need to represent the limit.  */
1330   min_ni_width
1331     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1332
1333   /* Find a scalar mode for which WHILE_ULT is supported.  */
1334   opt_scalar_int_mode cmp_mode_iter;
1335   tree cmp_type = NULL_TREE;
1336   tree iv_type = NULL_TREE;
1337   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1338   unsigned int iv_precision = UINT_MAX;
1339
1340   if (iv_limit != -1)
1341     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1342                                       UNSIGNED);
1343
1344   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1345     {
1346       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1347       if (cmp_bits >= min_ni_width
1348           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1349         {
1350           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1351           if (this_type
1352               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1353             {
1354               /* Although we could stop as soon as we find a valid mode,
1355                  there are at least two reasons why that's not always the
1356                  best choice:
1357
1358                  - An IV that's Pmode or wider is more likely to be reusable
1359                    in address calculations than an IV that's narrower than
1360                    Pmode.
1361
1362                  - Doing the comparison in IV_PRECISION or wider allows
1363                    a natural 0-based IV, whereas using a narrower comparison
1364                    type requires mitigations against wrap-around.
1365
1366                  Conversely, if the IV limit is variable, doing the comparison
1367                  in a wider type than the original type can introduce
1368                  unnecessary extensions, so picking the widest valid mode
1369                  is not always a good choice either.
1370
1371                  Here we prefer the first IV type that's Pmode or wider,
1372                  and the first comparison type that's IV_PRECISION or wider.
1373                  (The comparison type must be no wider than the IV type,
1374                  to avoid extensions in the vector loop.)
1375
1376                  ??? We might want to try continuing beyond Pmode for ILP32
1377                  targets if CMP_BITS < IV_PRECISION.  */
1378               iv_type = this_type;
1379               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1380                 cmp_type = this_type;
1381               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1382                 break;
1383             }
1384         }
1385     }
1386
1387   if (!cmp_type)
1388     {
1389       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1390       return false;
1391     }
1392
1393   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1394   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1395   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1396   return true;
1397 }
1398
1399 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1400    whether we can actually generate AVX512 style masks.  Return true if so,
1401    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1402
1403 static bool
1404 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1405 {
1406   /* Produce differently organized rgc_vec and differently check
1407      we can produce masks.  */
1408
1409   /* Use a normal loop if there are no statements that need masking.
1410      This only happens in rare degenerate cases: it means that the loop
1411      has no loads, no stores, and no live-out values.  */
1412   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1413     return false;
1414
1415   /* For the decrementing IV we need to represent all values in
1416      [0, niter + niter_skip] where niter_skip is the elements we
1417      skip in the first iteration for prologue peeling.  */
1418   tree iv_type = NULL_TREE;
1419   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1420   unsigned int iv_precision = UINT_MAX;
1421   if (iv_limit != -1)
1422     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1423
1424   /* First compute the type for the IV we use to track the remaining
1425      scalar iterations.  */
1426   opt_scalar_int_mode cmp_mode_iter;
1427   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1428     {
1429       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1430       if (cmp_bits >= iv_precision
1431           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1432         {
1433           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1434           if (iv_type)
1435             break;
1436         }
1437     }
1438   if (!iv_type)
1439     return false;
1440
1441   /* Produce the rgroup controls.  */
1442   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1443     {
1444       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1445       tree vectype = mask.first;
1446       unsigned nvectors = mask.second;
1447
1448       /* The number of scalars per iteration and the number of vectors are
1449          both compile-time constants.  */
1450       unsigned int nscalars_per_iter
1451         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1452                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1453
1454       /* We index the rgroup_controls vector with nscalars_per_iter
1455          which we keep constant and instead have a varying nvectors,
1456          remembering the vector mask with the fewest nV.  */
1457       if (masks->rgc_vec.length () < nscalars_per_iter)
1458         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1459       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1460
1461       if (!rgm->type || rgm->factor > nvectors)
1462         {
1463           rgm->type = truth_type_for (vectype);
1464           rgm->compare_type = NULL_TREE;
1465           rgm->max_nscalars_per_iter = nscalars_per_iter;
1466           rgm->factor = nvectors;
1467           rgm->bias_adjusted_ctrl = NULL_TREE;
1468         }
1469     }
1470
1471   /* There is no fixed compare type we are going to use but we have to
1472      be able to get at one for each mask group.  */
1473   unsigned int min_ni_width
1474     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1475
1476   bool ok = true;
1477   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1478     {
1479       tree mask_type = rgc.type;
1480       if (!mask_type)
1481         continue;
1482
1483       /* For now vect_get_loop_mask only supports integer mode masks
1484          when we need to split it.  */
1485       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1486           || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1487         {
1488           ok = false;
1489           break;
1490         }
1491
1492       /* If iv_type is usable as compare type use that - we can elide the
1493          saturation in that case.   */
1494       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1495         {
1496           tree cmp_vectype
1497             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1498           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1499             rgc.compare_type = cmp_vectype;
1500         }
1501       if (!rgc.compare_type)
1502         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1503           {
1504             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1505             if (cmp_bits >= min_ni_width
1506                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1507               {
1508                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1509                 if (!cmp_type)
1510                   continue;
1511
1512                 /* Check whether we can produce the mask with cmp_type.  */
1513                 tree cmp_vectype
1514                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1515                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1516                   {
1517                     rgc.compare_type = cmp_vectype;
1518                     break;
1519                   }
1520               }
1521         }
1522       if (!rgc.compare_type)
1523         {
1524           ok = false;
1525           break;
1526         }
1527     }
1528   if (!ok)
1529     {
1530       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1531       return false;
1532     }
1533
1534   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1535   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1536   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1537   return true;
1538 }
1539
1540 /* Check whether we can use vector access with length based on precison
1541    comparison.  So far, to keep it simple, we only allow the case that the
1542    precision of the target supported length is larger than the precision
1543    required by loop niters.  */
1544
1545 static bool
1546 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1547 {
1548   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1549     return false;
1550
1551   machine_mode len_load_mode, len_store_mode;
1552   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1553          .exists (&len_load_mode))
1554     return false;
1555   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1556          .exists (&len_store_mode))
1557     return false;
1558
1559   signed char partial_load_bias = internal_len_load_store_bias
1560     (IFN_LEN_LOAD, len_load_mode);
1561
1562   signed char partial_store_bias = internal_len_load_store_bias
1563     (IFN_LEN_STORE, len_store_mode);
1564
1565   gcc_assert (partial_load_bias == partial_store_bias);
1566
1567   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1568     return false;
1569
1570   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1571      len_loads with a length of zero.  In order to avoid that we prohibit
1572      more than one loop length here.  */
1573   if (partial_load_bias == -1
1574       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1575     return false;
1576
1577   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1578
1579   unsigned int max_nitems_per_iter = 1;
1580   unsigned int i;
1581   rgroup_controls *rgl;
1582   /* Find the maximum number of items per iteration for every rgroup.  */
1583   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1584     {
1585       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1586       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1587     }
1588
1589   /* Work out how many bits we need to represent the length limit.  */
1590   unsigned int min_ni_prec
1591     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1592
1593   /* Now use the maximum of below precisions for one suitable IV type:
1594      - the IV's natural precision
1595      - the precision needed to hold: the maximum number of scalar
1596        iterations multiplied by the scale factor (min_ni_prec above)
1597      - the Pmode precision
1598
1599      If min_ni_prec is less than the precision of the current niters,
1600      we perfer to still use the niters type.  Prefer to use Pmode and
1601      wider IV to avoid narrow conversions.  */
1602
1603   unsigned int ni_prec
1604     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1605   min_ni_prec = MAX (min_ni_prec, ni_prec);
1606   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1607
1608   tree iv_type = NULL_TREE;
1609   opt_scalar_int_mode tmode_iter;
1610   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1611     {
1612       scalar_mode tmode = tmode_iter.require ();
1613       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1614
1615       /* ??? Do we really want to construct one IV whose precision exceeds
1616          BITS_PER_WORD?  */
1617       if (tbits > BITS_PER_WORD)
1618         break;
1619
1620       /* Find the first available standard integral type.  */
1621       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1622         {
1623           iv_type = build_nonstandard_integer_type (tbits, true);
1624           break;
1625         }
1626     }
1627
1628   if (!iv_type)
1629     {
1630       if (dump_enabled_p ())
1631         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1632                          "can't vectorize with length-based partial vectors"
1633                          " because there is no suitable iv type.\n");
1634       return false;
1635     }
1636
1637   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1638   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1639   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1640
1641   return true;
1642 }
1643
1644 /* Calculate the cost of one scalar iteration of the loop.  */
1645 static void
1646 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1647 {
1648   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1649   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1650   int nbbs = loop->num_nodes, factor;
1651   int innerloop_iters, i;
1652
1653   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1654
1655   /* Gather costs for statements in the scalar loop.  */
1656
1657   /* FORNOW.  */
1658   innerloop_iters = 1;
1659   if (loop->inner)
1660     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1661
1662   for (i = 0; i < nbbs; i++)
1663     {
1664       gimple_stmt_iterator si;
1665       basic_block bb = bbs[i];
1666
1667       if (bb->loop_father == loop->inner)
1668         factor = innerloop_iters;
1669       else
1670         factor = 1;
1671
1672       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1673         {
1674           gimple *stmt = gsi_stmt (si);
1675           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1676
1677           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1678             continue;
1679
1680           /* Skip stmts that are not vectorized inside the loop.  */
1681           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1682           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1683               && (!STMT_VINFO_LIVE_P (vstmt_info)
1684                   || !VECTORIZABLE_CYCLE_DEF
1685                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1686             continue;
1687
1688           vect_cost_for_stmt kind;
1689           if (STMT_VINFO_DATA_REF (stmt_info))
1690             {
1691               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1692                kind = scalar_load;
1693              else
1694                kind = scalar_store;
1695             }
1696           else if (vect_nop_conversion_p (stmt_info))
1697             continue;
1698           else
1699             kind = scalar_stmt;
1700
1701           /* We are using vect_prologue here to avoid scaling twice
1702              by the inner loop factor.  */
1703           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1704                             factor, kind, stmt_info, 0, vect_prologue);
1705         }
1706     }
1707
1708   /* Now accumulate cost.  */
1709   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1710   add_stmt_costs (loop_vinfo->scalar_costs,
1711                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1712   loop_vinfo->scalar_costs->finish_cost (nullptr);
1713 }
1714
1715 /* Function vect_analyze_loop_form.
1716
1717    Verify that certain CFG restrictions hold, including:
1718    - the loop has a pre-header
1719    - the loop has a single entry
1720    - nested loops can have only a single exit.
1721    - the loop exit condition is simple enough
1722    - the number of iterations can be analyzed, i.e, a countable loop.  The
1723      niter could be analyzed under some assumptions.  */
1724
1725 opt_result
1726 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1727 {
1728   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1729
1730   edge exit_e = vec_init_loop_exit_info (loop);
1731   if (!exit_e)
1732     return opt_result::failure_at (vect_location,
1733                                    "not vectorized:"
1734                                    " could not determine main exit from"
1735                                    " loop with multiple exits.\n");
1736   info->loop_exit = exit_e;
1737   if (dump_enabled_p ())
1738       dump_printf_loc (MSG_NOTE, vect_location,
1739                        "using as main loop exit: %d -> %d [AUX: %p]\n",
1740                        exit_e->src->index, exit_e->dest->index, exit_e->aux);
1741
1742   /* Check if we have any control flow that doesn't leave the loop.  */
1743   class loop *v_loop = loop->inner ? loop->inner : loop;
1744   basic_block *bbs= get_loop_body (v_loop);
1745   for (unsigned i = 0; i < v_loop->num_nodes; i++)
1746     if (EDGE_COUNT (bbs[i]->succs) != 1
1747         && (EDGE_COUNT (bbs[i]->succs) != 2
1748             || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1749       return opt_result::failure_at (vect_location,
1750                                      "not vectorized:"
1751                                      " unsupported control flow in loop.\n");
1752
1753   /* Different restrictions apply when we are considering an inner-most loop,
1754      vs. an outer (nested) loop.
1755      (FORNOW. May want to relax some of these restrictions in the future).  */
1756
1757   info->inner_loop_cond = NULL;
1758   if (!loop->inner)
1759     {
1760       /* Inner-most loop.  We currently require that the number of BBs is
1761          exactly 2 (the header and latch).  Vectorizable inner-most loops
1762          look like this:
1763
1764                         (pre-header)
1765                            |
1766                           header <--------+
1767                            | |            |
1768                            | +--> latch --+
1769                            |
1770                         (exit-bb)  */
1771
1772       if (empty_block_p (loop->header))
1773         return opt_result::failure_at (vect_location,
1774                                        "not vectorized: empty loop.\n");
1775     }
1776   else
1777     {
1778       class loop *innerloop = loop->inner;
1779       edge entryedge;
1780
1781       /* Nested loop. We currently require that the loop is doubly-nested,
1782          contains a single inner loop, and the number of BBs is exactly 5.
1783          Vectorizable outer-loops look like this:
1784
1785                         (pre-header)
1786                            |
1787                           header <---+
1788                            |         |
1789                           inner-loop |
1790                            |         |
1791                           tail ------+
1792                            |
1793                         (exit-bb)
1794
1795          The inner-loop has the properties expected of inner-most loops
1796          as described above.  */
1797
1798       if ((loop->inner)->inner || (loop->inner)->next)
1799         return opt_result::failure_at (vect_location,
1800                                        "not vectorized:"
1801                                        " multiple nested loops.\n");
1802
1803       entryedge = loop_preheader_edge (innerloop);
1804       if (entryedge->src != loop->header
1805           || !single_exit (innerloop)
1806           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1807         return opt_result::failure_at (vect_location,
1808                                        "not vectorized:"
1809                                        " unsupported outerloop form.\n");
1810
1811       /* Analyze the inner-loop.  */
1812       vect_loop_form_info inner;
1813       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1814       if (!res)
1815         {
1816           if (dump_enabled_p ())
1817             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1818                              "not vectorized: Bad inner loop.\n");
1819           return res;
1820         }
1821
1822       /* Don't support analyzing niter under assumptions for inner
1823          loop.  */
1824       if (!integer_onep (inner.assumptions))
1825         return opt_result::failure_at (vect_location,
1826                                        "not vectorized: Bad inner loop.\n");
1827
1828       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1829         return opt_result::failure_at (vect_location,
1830                                        "not vectorized: inner-loop count not"
1831                                        " invariant.\n");
1832
1833       if (dump_enabled_p ())
1834         dump_printf_loc (MSG_NOTE, vect_location,
1835                          "Considering outer-loop vectorization.\n");
1836       info->inner_loop_cond = inner.conds[0];
1837     }
1838
1839   if (EDGE_COUNT (loop->header->preds) != 2)
1840     return opt_result::failure_at (vect_location,
1841                                    "not vectorized:"
1842                                    " too many incoming edges.\n");
1843
1844   /* We assume that the loop exit condition is at the end of the loop. i.e,
1845      that the loop is represented as a do-while (with a proper if-guard
1846      before the loop if needed), where the loop header contains all the
1847      executable statements, and the latch is empty.  */
1848   if (!empty_block_p (loop->latch)
1849       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1850     return opt_result::failure_at (vect_location,
1851                                    "not vectorized: latch block not empty.\n");
1852
1853   /* Make sure the exit is not abnormal.  */
1854   auto_vec<edge> exits = get_loop_exit_edges (loop);
1855   for (edge e : exits)
1856     {
1857       if (e->flags & EDGE_ABNORMAL)
1858         return opt_result::failure_at (vect_location,
1859                                        "not vectorized:"
1860                                        " abnormal loop exit edge.\n");
1861     }
1862
1863   info->conds
1864     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1865                             &info->number_of_iterations,
1866                             &info->number_of_iterationsm1);
1867
1868   if (info->conds.is_empty ())
1869     return opt_result::failure_at
1870       (vect_location,
1871        "not vectorized: complicated exit condition.\n");
1872
1873   /* Determine what the primary and alternate exit conds are.  */
1874   for (unsigned i = 0; i < info->conds.length (); i++)
1875     {
1876       gcond *cond = info->conds[i];
1877       if (exit_e->src == gimple_bb (cond))
1878         std::swap (info->conds[0], info->conds[i]);
1879     }
1880
1881   if (integer_zerop (info->assumptions)
1882       || !info->number_of_iterations
1883       || chrec_contains_undetermined (info->number_of_iterations))
1884     return opt_result::failure_at
1885       (info->conds[0],
1886        "not vectorized: number of iterations cannot be computed.\n");
1887
1888   if (integer_zerop (info->number_of_iterations))
1889     return opt_result::failure_at
1890       (info->conds[0],
1891        "not vectorized: number of iterations = 0.\n");
1892
1893   if (!(tree_fits_shwi_p (info->number_of_iterations)
1894         && tree_to_shwi (info->number_of_iterations) > 0))
1895     {
1896       if (dump_enabled_p ())
1897         {
1898           dump_printf_loc (MSG_NOTE, vect_location,
1899                            "Symbolic number of iterations is ");
1900           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1901           dump_printf (MSG_NOTE, "\n");
1902         }
1903     }
1904
1905   return opt_result::success ();
1906 }
1907
1908 /* Create a loop_vec_info for LOOP with SHARED and the
1909    vect_analyze_loop_form result.  */
1910
1911 loop_vec_info
1912 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1913                         const vect_loop_form_info *info,
1914                         loop_vec_info main_loop_info)
1915 {
1916   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1917   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1918   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1919   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1920   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1921   /* Also record the assumptions for versioning.  */
1922   if (!integer_onep (info->assumptions) && !main_loop_info)
1923     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1924
1925   for (gcond *cond : info->conds)
1926     {
1927       stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1928       STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1929       /* Mark the statement as a condition.  */
1930       STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1931     }
1932
1933   for (unsigned i = 1; i < info->conds.length (); i ++)
1934     LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1935   LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1936
1937   LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1938
1939   /* Check to see if we're vectorizing multiple exits.  */
1940   LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1941     = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1942
1943   if (info->inner_loop_cond)
1944     {
1945       stmt_vec_info inner_loop_cond_info
1946         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1947       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1948       /* If we have an estimate on the number of iterations of the inner
1949          loop use that to limit the scale for costing, otherwise use
1950          --param vect-inner-loop-cost-factor literally.  */
1951       widest_int nit;
1952       if (estimated_stmt_executions (loop->inner, &nit))
1953         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1954           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1955     }
1956
1957   return loop_vinfo;
1958 }
1959
1960
1961
1962 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1963    statements update the vectorization factor.  */
1964
1965 static void
1966 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1967 {
1968   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1969   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1970   int nbbs = loop->num_nodes;
1971   poly_uint64 vectorization_factor;
1972   int i;
1973
1974   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1975
1976   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1977   gcc_assert (known_ne (vectorization_factor, 0U));
1978
1979   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1980      vectorization factor of the loop is the unrolling factor required by
1981      the SLP instances.  If that unrolling factor is 1, we say, that we
1982      perform pure SLP on loop - cross iteration parallelism is not
1983      exploited.  */
1984   bool only_slp_in_loop = true;
1985   for (i = 0; i < nbbs; i++)
1986     {
1987       basic_block bb = bbs[i];
1988       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1989            gsi_next (&si))
1990         {
1991           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1992           if (!stmt_info)
1993             continue;
1994           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1995                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1996               && !PURE_SLP_STMT (stmt_info))
1997             /* STMT needs both SLP and loop-based vectorization.  */
1998             only_slp_in_loop = false;
1999         }
2000       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2001            gsi_next (&si))
2002         {
2003           if (is_gimple_debug (gsi_stmt (si)))
2004             continue;
2005           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2006           stmt_info = vect_stmt_to_vectorize (stmt_info);
2007           if ((STMT_VINFO_RELEVANT_P (stmt_info)
2008                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2009               && !PURE_SLP_STMT (stmt_info))
2010             /* STMT needs both SLP and loop-based vectorization.  */
2011             only_slp_in_loop = false;
2012         }
2013     }
2014
2015   if (only_slp_in_loop)
2016     {
2017       if (dump_enabled_p ())
2018         dump_printf_loc (MSG_NOTE, vect_location,
2019                          "Loop contains only SLP stmts\n");
2020       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2021     }
2022   else
2023     {
2024       if (dump_enabled_p ())
2025         dump_printf_loc (MSG_NOTE, vect_location,
2026                          "Loop contains SLP and non-SLP stmts\n");
2027       /* Both the vectorization factor and unroll factor have the form
2028          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2029          so they must have a common multiple.  */
2030       vectorization_factor
2031         = force_common_multiple (vectorization_factor,
2032                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2033     }
2034
2035   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2036   if (dump_enabled_p ())
2037     {
2038       dump_printf_loc (MSG_NOTE, vect_location,
2039                        "Updating vectorization factor to ");
2040       dump_dec (MSG_NOTE, vectorization_factor);
2041       dump_printf (MSG_NOTE, ".\n");
2042     }
2043 }
2044
2045 /* Return true if STMT_INFO describes a double reduction phi and if
2046    the other phi in the reduction is also relevant for vectorization.
2047    This rejects cases such as:
2048
2049       outer1:
2050         x_1 = PHI <x_3(outer2), ...>;
2051         ...
2052
2053       inner:
2054         x_2 = ...;
2055         ...
2056
2057       outer2:
2058         x_3 = PHI <x_2(inner)>;
2059
2060    if nothing in x_2 or elsewhere makes x_1 relevant.  */
2061
2062 static bool
2063 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2064 {
2065   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2066     return false;
2067
2068   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2069 }
2070
2071 /* Function vect_analyze_loop_operations.
2072
2073    Scan the loop stmts and make sure they are all vectorizable.  */
2074
2075 static opt_result
2076 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2077 {
2078   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2079   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2080   int nbbs = loop->num_nodes;
2081   int i;
2082   stmt_vec_info stmt_info;
2083   bool need_to_vectorize = false;
2084   bool ok;
2085
2086   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2087
2088   auto_vec<stmt_info_for_cost> cost_vec;
2089
2090   for (i = 0; i < nbbs; i++)
2091     {
2092       basic_block bb = bbs[i];
2093
2094       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2095            gsi_next (&si))
2096         {
2097           gphi *phi = si.phi ();
2098           ok = true;
2099
2100           stmt_info = loop_vinfo->lookup_stmt (phi);
2101           if (dump_enabled_p ())
2102             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2103                              (gimple *) phi);
2104           if (virtual_operand_p (gimple_phi_result (phi)))
2105             continue;
2106
2107           /* Inner-loop loop-closed exit phi in outer-loop vectorization
2108              (i.e., a phi in the tail of the outer-loop).  */
2109           if (! is_loop_header_bb_p (bb))
2110             {
2111               /* FORNOW: we currently don't support the case that these phis
2112                  are not used in the outerloop (unless it is double reduction,
2113                  i.e., this phi is vect_reduction_def), cause this case
2114                  requires to actually do something here.  */
2115               if (STMT_VINFO_LIVE_P (stmt_info)
2116                   && !vect_active_double_reduction_p (stmt_info))
2117                 return opt_result::failure_at (phi,
2118                                                "Unsupported loop-closed phi"
2119                                                " in outer-loop.\n");
2120
2121               /* If PHI is used in the outer loop, we check that its operand
2122                  is defined in the inner loop.  */
2123               if (STMT_VINFO_RELEVANT_P (stmt_info))
2124                 {
2125                   tree phi_op;
2126
2127                   if (gimple_phi_num_args (phi) != 1)
2128                     return opt_result::failure_at (phi, "unsupported phi");
2129
2130                   phi_op = PHI_ARG_DEF (phi, 0);
2131                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2132                   if (!op_def_info)
2133                     return opt_result::failure_at (phi, "unsupported phi\n");
2134
2135                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2136                       && (STMT_VINFO_RELEVANT (op_def_info)
2137                           != vect_used_in_outer_by_reduction))
2138                     return opt_result::failure_at (phi, "unsupported phi\n");
2139
2140                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2141                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2142                            == vect_double_reduction_def))
2143                       && !vectorizable_lc_phi (loop_vinfo,
2144                                                stmt_info, NULL, NULL))
2145                     return opt_result::failure_at (phi, "unsupported phi\n");
2146                 }
2147
2148               continue;
2149             }
2150
2151           gcc_assert (stmt_info);
2152
2153           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2154                || STMT_VINFO_LIVE_P (stmt_info))
2155               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2156               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2157             /* A scalar-dependence cycle that we don't support.  */
2158             return opt_result::failure_at (phi,
2159                                            "not vectorized:"
2160                                            " scalar dependence cycle.\n");
2161
2162           if (STMT_VINFO_RELEVANT_P (stmt_info))
2163             {
2164               need_to_vectorize = true;
2165               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2166                   && ! PURE_SLP_STMT (stmt_info))
2167                 ok = vectorizable_induction (loop_vinfo,
2168                                              stmt_info, NULL, NULL,
2169                                              &cost_vec);
2170               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2171                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2172                             == vect_double_reduction_def)
2173                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2174                        && ! PURE_SLP_STMT (stmt_info))
2175                 ok = vectorizable_reduction (loop_vinfo,
2176                                              stmt_info, NULL, NULL, &cost_vec);
2177               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2178                         == vect_first_order_recurrence)
2179                        && ! PURE_SLP_STMT (stmt_info))
2180                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2181                                            &cost_vec);
2182             }
2183
2184           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2185           if (ok
2186               && STMT_VINFO_LIVE_P (stmt_info)
2187               && !PURE_SLP_STMT (stmt_info))
2188             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2189                                               -1, false, &cost_vec);
2190
2191           if (!ok)
2192             return opt_result::failure_at (phi,
2193                                            "not vectorized: relevant phi not "
2194                                            "supported: %G",
2195                                            static_cast <gimple *> (phi));
2196         }
2197
2198       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2199            gsi_next (&si))
2200         {
2201           gimple *stmt = gsi_stmt (si);
2202           if (!gimple_clobber_p (stmt)
2203               && !is_gimple_debug (stmt))
2204             {
2205               opt_result res
2206                 = vect_analyze_stmt (loop_vinfo,
2207                                      loop_vinfo->lookup_stmt (stmt),
2208                                      &need_to_vectorize,
2209                                      NULL, NULL, &cost_vec);
2210               if (!res)
2211                 return res;
2212             }
2213         }
2214     } /* bbs */
2215
2216   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2217
2218   /* All operations in the loop are either irrelevant (deal with loop
2219      control, or dead), or only used outside the loop and can be moved
2220      out of the loop (e.g. invariants, inductions).  The loop can be
2221      optimized away by scalar optimizations.  We're better off not
2222      touching this loop.  */
2223   if (!need_to_vectorize)
2224     {
2225       if (dump_enabled_p ())
2226         dump_printf_loc (MSG_NOTE, vect_location,
2227                          "All the computation can be taken out of the loop.\n");
2228       return opt_result::failure_at
2229         (vect_location,
2230          "not vectorized: redundant loop. no profit to vectorize.\n");
2231     }
2232
2233   return opt_result::success ();
2234 }
2235
2236 /* Return true if we know that the iteration count is smaller than the
2237    vectorization factor.  Return false if it isn't, or if we can't be sure
2238    either way.  */
2239
2240 static bool
2241 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2242 {
2243   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2244
2245   HOST_WIDE_INT max_niter;
2246   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2247     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2248   else
2249     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2250
2251   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2252     return true;
2253
2254   return false;
2255 }
2256
2257 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2258    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2259    definitely no, or -1 if it's worth retrying.  */
2260
2261 static int
2262 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2263                            unsigned *suggested_unroll_factor)
2264 {
2265   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2266   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2267
2268   /* Only loops that can handle partially-populated vectors can have iteration
2269      counts less than the vectorization factor.  */
2270   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2271       && vect_known_niters_smaller_than_vf (loop_vinfo))
2272     {
2273       if (dump_enabled_p ())
2274         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275                          "not vectorized: iteration count smaller than "
2276                          "vectorization factor.\n");
2277       return 0;
2278     }
2279
2280   /* If we know the number of iterations we can do better, for the
2281      epilogue we can also decide whether the main loop leaves us
2282      with enough iterations, prefering a smaller vector epilog then
2283      also possibly used for the case we skip the vector loop.  */
2284   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2285     {
2286       widest_int scalar_niters
2287         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2288       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2289         {
2290           loop_vec_info orig_loop_vinfo
2291             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2292           unsigned lowest_vf
2293             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2294           int prolog_peeling = 0;
2295           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2296             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2297           if (prolog_peeling >= 0
2298               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2299                            lowest_vf))
2300             {
2301               unsigned gap
2302                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2303               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2304                                % lowest_vf + gap);
2305             }
2306         }
2307       /* Reject vectorizing for a single scalar iteration, even if
2308          we could in principle implement that using partial vectors.  */
2309       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2310       if (scalar_niters <= peeling_gap + 1)
2311         {
2312           if (dump_enabled_p ())
2313             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314                              "not vectorized: loop only has a single "
2315                              "scalar iteration.\n");
2316           return 0;
2317         }
2318
2319       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2320         {
2321           /* Check that the loop processes at least one full vector.  */
2322           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2323           if (known_lt (scalar_niters, vf))
2324             {
2325               if (dump_enabled_p ())
2326                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2327                                  "loop does not have enough iterations "
2328                                  "to support vectorization.\n");
2329               return 0;
2330             }
2331
2332           /* If we need to peel an extra epilogue iteration to handle data
2333              accesses with gaps, check that there are enough scalar iterations
2334              available.
2335
2336              The check above is redundant with this one when peeling for gaps,
2337              but the distinction is useful for diagnostics.  */
2338           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2339               && known_le (scalar_niters, vf))
2340             {
2341               if (dump_enabled_p ())
2342                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343                                  "loop does not have enough iterations "
2344                                  "to support peeling for gaps.\n");
2345               return 0;
2346             }
2347         }
2348     }
2349
2350   /* If using the "very cheap" model. reject cases in which we'd keep
2351      a copy of the scalar code (even if we might be able to vectorize it).  */
2352   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2353       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2354           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2355           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2356     {
2357       if (dump_enabled_p ())
2358         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359                          "some scalar iterations would need to be peeled\n");
2360       return 0;
2361     }
2362
2363   int min_profitable_iters, min_profitable_estimate;
2364   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2365                                       &min_profitable_estimate,
2366                                       suggested_unroll_factor);
2367
2368   if (min_profitable_iters < 0)
2369     {
2370       if (dump_enabled_p ())
2371         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2372                          "not vectorized: vectorization not profitable.\n");
2373       if (dump_enabled_p ())
2374         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2375                          "not vectorized: vector version will never be "
2376                          "profitable.\n");
2377       return -1;
2378     }
2379
2380   int min_scalar_loop_bound = (param_min_vect_loop_bound
2381                                * assumed_vf);
2382
2383   /* Use the cost model only if it is more conservative than user specified
2384      threshold.  */
2385   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2386                                     min_profitable_iters);
2387
2388   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2389
2390   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2391       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2392     {
2393       if (dump_enabled_p ())
2394         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395                          "not vectorized: vectorization not profitable.\n");
2396       if (dump_enabled_p ())
2397         dump_printf_loc (MSG_NOTE, vect_location,
2398                          "not vectorized: iteration count smaller than user "
2399                          "specified loop bound parameter or minimum profitable "
2400                          "iterations (whichever is more conservative).\n");
2401       return 0;
2402     }
2403
2404   /* The static profitablity threshold min_profitable_estimate includes
2405      the cost of having to check at runtime whether the scalar loop
2406      should be used instead.  If it turns out that we don't need or want
2407      such a check, the threshold we should use for the static estimate
2408      is simply the point at which the vector loop becomes more profitable
2409      than the scalar loop.  */
2410   if (min_profitable_estimate > min_profitable_iters
2411       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2412       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2413       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2414       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2415     {
2416       if (dump_enabled_p ())
2417         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2418                          " choice between the scalar and vector loops\n");
2419       min_profitable_estimate = min_profitable_iters;
2420     }
2421
2422   /* If the vector loop needs multiple iterations to be beneficial then
2423      things are probably too close to call, and the conservative thing
2424      would be to stick with the scalar code.  */
2425   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2426       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2427     {
2428       if (dump_enabled_p ())
2429         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430                          "one iteration of the vector loop would be"
2431                          " more expensive than the equivalent number of"
2432                          " iterations of the scalar loop\n");
2433       return 0;
2434     }
2435
2436   HOST_WIDE_INT estimated_niter;
2437
2438   /* If we are vectorizing an epilogue then we know the maximum number of
2439      scalar iterations it will cover is at least one lower than the
2440      vectorization factor of the main loop.  */
2441   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2442     estimated_niter
2443       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2444   else
2445     {
2446       estimated_niter = estimated_stmt_executions_int (loop);
2447       if (estimated_niter == -1)
2448         estimated_niter = likely_max_stmt_executions_int (loop);
2449     }
2450   if (estimated_niter != -1
2451       && ((unsigned HOST_WIDE_INT) estimated_niter
2452           < MAX (th, (unsigned) min_profitable_estimate)))
2453     {
2454       if (dump_enabled_p ())
2455         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2456                          "not vectorized: estimated iteration count too "
2457                          "small.\n");
2458       if (dump_enabled_p ())
2459         dump_printf_loc (MSG_NOTE, vect_location,
2460                          "not vectorized: estimated iteration count smaller "
2461                          "than specified loop bound parameter or minimum "
2462                          "profitable iterations (whichever is more "
2463                          "conservative).\n");
2464       return -1;
2465     }
2466
2467   return 1;
2468 }
2469
2470 static opt_result
2471 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2472                            vec<data_reference_p> *datarefs,
2473                            unsigned int *n_stmts)
2474 {
2475   *n_stmts = 0;
2476   for (unsigned i = 0; i < loop->num_nodes; i++)
2477     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2478          !gsi_end_p (gsi); gsi_next (&gsi))
2479       {
2480         gimple *stmt = gsi_stmt (gsi);
2481         if (is_gimple_debug (stmt))
2482           continue;
2483         ++(*n_stmts);
2484         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2485                                                         NULL, 0);
2486         if (!res)
2487           {
2488             if (is_gimple_call (stmt) && loop->safelen)
2489               {
2490                 tree fndecl = gimple_call_fndecl (stmt), op;
2491                 if (fndecl == NULL_TREE
2492                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2493                   {
2494                     fndecl = gimple_call_arg (stmt, 0);
2495                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2496                     fndecl = TREE_OPERAND (fndecl, 0);
2497                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2498                   }
2499                 if (fndecl != NULL_TREE)
2500                   {
2501                     cgraph_node *node = cgraph_node::get (fndecl);
2502                     if (node != NULL && node->simd_clones != NULL)
2503                       {
2504                         unsigned int j, n = gimple_call_num_args (stmt);
2505                         for (j = 0; j < n; j++)
2506                           {
2507                             op = gimple_call_arg (stmt, j);
2508                             if (DECL_P (op)
2509                                 || (REFERENCE_CLASS_P (op)
2510                                     && get_base_address (op)))
2511                               break;
2512                           }
2513                         op = gimple_call_lhs (stmt);
2514                         /* Ignore #pragma omp declare simd functions
2515                            if they don't have data references in the
2516                            call stmt itself.  */
2517                         if (j == n
2518                             && !(op
2519                                  && (DECL_P (op)
2520                                      || (REFERENCE_CLASS_P (op)
2521                                          && get_base_address (op)))))
2522                           continue;
2523                       }
2524                   }
2525               }
2526             return res;
2527           }
2528         /* If dependence analysis will give up due to the limit on the
2529            number of datarefs stop here and fail fatally.  */
2530         if (datarefs->length ()
2531             > (unsigned)param_loop_max_datarefs_for_datadeps)
2532           return opt_result::failure_at (stmt, "exceeded param "
2533                                          "loop-max-datarefs-for-datadeps\n");
2534       }
2535   return opt_result::success ();
2536 }
2537
2538 /* Look for SLP-only access groups and turn each individual access into its own
2539    group.  */
2540 static void
2541 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2542 {
2543   unsigned int i;
2544   struct data_reference *dr;
2545
2546   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2547
2548   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2549   FOR_EACH_VEC_ELT (datarefs, i, dr)
2550     {
2551       gcc_assert (DR_REF (dr));
2552       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2553
2554       /* Check if the load is a part of an interleaving chain.  */
2555       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2556         {
2557           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2558           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2559           unsigned int group_size = DR_GROUP_SIZE (first_element);
2560
2561           /* Check if SLP-only groups.  */
2562           if (!STMT_SLP_TYPE (stmt_info)
2563               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2564             {
2565               /* Dissolve the group.  */
2566               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2567
2568               stmt_vec_info vinfo = first_element;
2569               while (vinfo)
2570                 {
2571                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2572                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2573                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2574                   DR_GROUP_SIZE (vinfo) = 1;
2575                   if (STMT_VINFO_STRIDED_P (first_element)
2576                       /* We cannot handle stores with gaps.  */
2577                       || DR_IS_WRITE (dr_info->dr))
2578                     {
2579                       STMT_VINFO_STRIDED_P (vinfo) = true;
2580                       DR_GROUP_GAP (vinfo) = 0;
2581                     }
2582                   else
2583                     DR_GROUP_GAP (vinfo) = group_size - 1;
2584                   /* Duplicate and adjust alignment info, it needs to
2585                      be present on each group leader, see dr_misalignment.  */
2586                   if (vinfo != first_element)
2587                     {
2588                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2589                       dr_info2->target_alignment = dr_info->target_alignment;
2590                       int misalignment = dr_info->misalignment;
2591                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2592                         {
2593                           HOST_WIDE_INT diff
2594                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2595                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2596                           unsigned HOST_WIDE_INT align_c
2597                             = dr_info->target_alignment.to_constant ();
2598                           misalignment = (misalignment + diff) % align_c;
2599                         }
2600                       dr_info2->misalignment = misalignment;
2601                     }
2602                   vinfo = next;
2603                 }
2604             }
2605         }
2606     }
2607 }
2608
2609 /* Determine if operating on full vectors for LOOP_VINFO might leave
2610    some scalar iterations still to do.  If so, decide how we should
2611    handle those scalar iterations.  The possibilities are:
2612
2613    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2614        In this case:
2615
2616          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2617          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2618          LOOP_VINFO_PEELING_FOR_NITER == false
2619
2620    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2621        to handle the remaining scalar iterations.  In this case:
2622
2623          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2624          LOOP_VINFO_PEELING_FOR_NITER == true
2625
2626        There are two choices:
2627
2628        (2a) Consider vectorizing the epilogue loop at the same VF as the
2629             main loop, but using partial vectors instead of full vectors.
2630             In this case:
2631
2632               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2633
2634        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2635             In this case:
2636
2637               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2638  */
2639
2640 opt_result
2641 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2642 {
2643   /* Determine whether there would be any scalar iterations left over.  */
2644   bool need_peeling_or_partial_vectors_p
2645     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2646
2647   /* Decide whether to vectorize the loop with partial vectors.  */
2648   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2649   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2650   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2651       && need_peeling_or_partial_vectors_p)
2652     {
2653       /* For partial-vector-usage=1, try to push the handling of partial
2654          vectors to the epilogue, with the main loop continuing to operate
2655          on full vectors.
2656
2657          If we are unrolling we also do not want to use partial vectors. This
2658          is to avoid the overhead of generating multiple masks and also to
2659          avoid having to execute entire iterations of FALSE masked instructions
2660          when dealing with one or less full iterations.
2661
2662          ??? We could then end up failing to use partial vectors if we
2663          decide to peel iterations into a prologue, and if the main loop
2664          then ends up processing fewer than VF iterations.  */
2665       if ((param_vect_partial_vector_usage == 1
2666            || loop_vinfo->suggested_unroll_factor > 1)
2667           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2668           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2669         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2670       else
2671         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2672     }
2673
2674   if (dump_enabled_p ())
2675     dump_printf_loc (MSG_NOTE, vect_location,
2676                      "operating on %s vectors%s.\n",
2677                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2678                      ? "partial" : "full",
2679                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2680                      ? " for epilogue loop" : "");
2681
2682   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2683     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2684        && need_peeling_or_partial_vectors_p);
2685
2686   /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2687      analysis that we don't know whether the loop is vectorized by partial
2688      vectors (More details see tree-vect-loop-manip.cc).
2689
2690      However, SELECT_VL vectorizaton style should only applied on partial
2691      vectorization since SELECT_VL is the GIMPLE IR that calculates the
2692      number of elements to be process for each iteration.
2693
2694      After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2695      if it is not partial vectorized loop.  */
2696   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2697     LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2698
2699   return opt_result::success ();
2700 }
2701
2702 /* Function vect_analyze_loop_2.
2703
2704    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2705    analyses will record information in some members of LOOP_VINFO.  FATAL
2706    indicates if some analysis meets fatal error.  If one non-NULL pointer
2707    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2708    worked out suggested unroll factor, while one NULL pointer shows it's
2709    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2710    is to hold the slp decision when the suggested unroll factor is worked
2711    out.  */
2712 static opt_result
2713 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2714                      unsigned *suggested_unroll_factor,
2715                      bool& slp_done_for_suggested_uf)
2716 {
2717   opt_result ok = opt_result::success ();
2718   int res;
2719   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2720   poly_uint64 min_vf = 2;
2721   loop_vec_info orig_loop_vinfo = NULL;
2722
2723   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2724      loop_vec_info of the first vectorized loop.  */
2725   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2726     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2727   else
2728     orig_loop_vinfo = loop_vinfo;
2729   gcc_assert (orig_loop_vinfo);
2730
2731   /* The first group of checks is independent of the vector size.  */
2732   fatal = true;
2733
2734   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2735       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2736     return opt_result::failure_at (vect_location,
2737                                    "not vectorized: simd if(0)\n");
2738
2739   /* Find all data references in the loop (which correspond to vdefs/vuses)
2740      and analyze their evolution in the loop.  */
2741
2742   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2743
2744   /* Gather the data references and count stmts in the loop.  */
2745   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2746     {
2747       opt_result res
2748         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2749                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2750                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2751       if (!res)
2752         {
2753           if (dump_enabled_p ())
2754             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2755                              "not vectorized: loop contains function "
2756                              "calls or data references that cannot "
2757                              "be analyzed\n");
2758           return res;
2759         }
2760       loop_vinfo->shared->save_datarefs ();
2761     }
2762   else
2763     loop_vinfo->shared->check_datarefs ();
2764
2765   /* Analyze the data references and also adjust the minimal
2766      vectorization factor according to the loads and stores.  */
2767
2768   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2769   if (!ok)
2770     {
2771       if (dump_enabled_p ())
2772         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2773                          "bad data references.\n");
2774       return ok;
2775     }
2776
2777   /* Check if we are applying unroll factor now.  */
2778   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2779   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2780
2781   /* If the slp decision is false when suggested unroll factor is worked
2782      out, and we are applying suggested unroll factor, we can simply skip
2783      all slp related analyses this time.  */
2784   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2785
2786   /* Classify all cross-iteration scalar data-flow cycles.
2787      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2788   vect_analyze_scalar_cycles (loop_vinfo, slp);
2789
2790   vect_pattern_recog (loop_vinfo);
2791
2792   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2793
2794   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2795      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2796
2797   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2798   if (!ok)
2799     {
2800       if (dump_enabled_p ())
2801         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2802                          "bad data access.\n");
2803       return ok;
2804     }
2805
2806   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2807
2808   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2809   if (!ok)
2810     {
2811       if (dump_enabled_p ())
2812         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2813                          "unexpected pattern.\n");
2814       return ok;
2815     }
2816
2817   /* While the rest of the analysis below depends on it in some way.  */
2818   fatal = false;
2819
2820   /* Analyze data dependences between the data-refs in the loop
2821      and adjust the maximum vectorization factor according to
2822      the dependences.
2823      FORNOW: fail at the first data dependence that we encounter.  */
2824
2825   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2826   if (!ok)
2827     {
2828       if (dump_enabled_p ())
2829         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2830                          "bad data dependence.\n");
2831       return ok;
2832     }
2833   if (max_vf != MAX_VECTORIZATION_FACTOR
2834       && maybe_lt (max_vf, min_vf))
2835     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2836   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2837
2838   ok = vect_determine_vectorization_factor (loop_vinfo);
2839   if (!ok)
2840     {
2841       if (dump_enabled_p ())
2842         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2843                          "can't determine vectorization factor.\n");
2844       return ok;
2845     }
2846
2847   /* Compute the scalar iteration cost.  */
2848   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2849
2850   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2851
2852   if (slp)
2853     {
2854       /* Check the SLP opportunities in the loop, analyze and build
2855          SLP trees.  */
2856       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2857       if (!ok)
2858         return ok;
2859
2860       /* If there are any SLP instances mark them as pure_slp.  */
2861       slp = vect_make_slp_decision (loop_vinfo);
2862       if (slp)
2863         {
2864           /* Find stmts that need to be both vectorized and SLPed.  */
2865           vect_detect_hybrid_slp (loop_vinfo);
2866
2867           /* Update the vectorization factor based on the SLP decision.  */
2868           vect_update_vf_for_slp (loop_vinfo);
2869
2870           /* Optimize the SLP graph with the vectorization factor fixed.  */
2871           vect_optimize_slp (loop_vinfo);
2872
2873           /* Gather the loads reachable from the SLP graph entries.  */
2874           vect_gather_slp_loads (loop_vinfo);
2875         }
2876     }
2877
2878   bool saved_can_use_partial_vectors_p
2879     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2880
2881   /* We don't expect to have to roll back to anything other than an empty
2882      set of rgroups.  */
2883   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2884
2885   /* This is the point where we can re-start analysis with SLP forced off.  */
2886 start_over:
2887
2888   /* Apply the suggested unrolling factor, this was determined by the backend
2889      during finish_cost the first time we ran the analyzis for this
2890      vector mode.  */
2891   if (applying_suggested_uf)
2892     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2893
2894   /* Now the vectorization factor is final.  */
2895   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2896   gcc_assert (known_ne (vectorization_factor, 0U));
2897
2898   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2899     {
2900       dump_printf_loc (MSG_NOTE, vect_location,
2901                        "vectorization_factor = ");
2902       dump_dec (MSG_NOTE, vectorization_factor);
2903       dump_printf (MSG_NOTE, ", niters = %wd\n",
2904                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2905     }
2906
2907   if (max_vf != MAX_VECTORIZATION_FACTOR
2908       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2909     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2910
2911   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2912
2913   /* Analyze the alignment of the data-refs in the loop.
2914      Fail if a data reference is found that cannot be vectorized.  */
2915
2916   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2917   if (!ok)
2918     {
2919       if (dump_enabled_p ())
2920         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2921                          "bad data alignment.\n");
2922       return ok;
2923     }
2924
2925   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2926      It is important to call pruning after vect_analyze_data_ref_accesses,
2927      since we use grouping information gathered by interleaving analysis.  */
2928   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2929   if (!ok)
2930     return ok;
2931
2932   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2933      vectorization, since we do not want to add extra peeling or
2934      add versioning for alignment.  */
2935   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2936     /* This pass will decide on using loop versioning and/or loop peeling in
2937        order to enhance the alignment of data references in the loop.  */
2938     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2939   if (!ok)
2940     return ok;
2941
2942   if (slp)
2943     {
2944       /* Analyze operations in the SLP instances.  Note this may
2945          remove unsupported SLP instances which makes the above
2946          SLP kind detection invalid.  */
2947       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2948       vect_slp_analyze_operations (loop_vinfo);
2949       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2950         {
2951           ok = opt_result::failure_at (vect_location,
2952                                        "unsupported SLP instances\n");
2953           goto again;
2954         }
2955
2956       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2957       slp_tree load_node, slp_root;
2958       unsigned i, x;
2959       slp_instance instance;
2960       bool can_use_lanes = true;
2961       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2962         {
2963           slp_root = SLP_INSTANCE_TREE (instance);
2964           int group_size = SLP_TREE_LANES (slp_root);
2965           tree vectype = SLP_TREE_VECTYPE (slp_root);
2966           bool loads_permuted = false;
2967           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2968             {
2969               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2970                 continue;
2971               unsigned j;
2972               stmt_vec_info load_info;
2973               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2974                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2975                   {
2976                     loads_permuted = true;
2977                     break;
2978                   }
2979             }
2980
2981           /* If the loads and stores can be handled with load/store-lane
2982              instructions record it and move on to the next instance.  */
2983           if (loads_permuted
2984               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2985               && vect_store_lanes_supported (vectype, group_size, false)
2986                    != IFN_LAST)
2987             {
2988               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2989                 if (STMT_VINFO_GROUPED_ACCESS
2990                       (SLP_TREE_REPRESENTATIVE (load_node)))
2991                   {
2992                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2993                         (SLP_TREE_REPRESENTATIVE (load_node));
2994                     /* Use SLP for strided accesses (or if we can't
2995                        load-lanes).  */
2996                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2997                         || vect_load_lanes_supported
2998                              (STMT_VINFO_VECTYPE (stmt_vinfo),
2999                               DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3000                       break;
3001                   }
3002
3003               can_use_lanes
3004                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3005
3006               if (can_use_lanes && dump_enabled_p ())
3007                 dump_printf_loc (MSG_NOTE, vect_location,
3008                                  "SLP instance %p can use load/store-lanes\n",
3009                                  (void *) instance);
3010             }
3011           else
3012             {
3013               can_use_lanes = false;
3014               break;
3015             }
3016         }
3017
3018       /* If all SLP instances can use load/store-lanes abort SLP and try again
3019          with SLP disabled.  */
3020       if (can_use_lanes)
3021         {
3022           ok = opt_result::failure_at (vect_location,
3023                                        "Built SLP cancelled: can use "
3024                                        "load/store-lanes\n");
3025           if (dump_enabled_p ())
3026             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3027                              "Built SLP cancelled: all SLP instances support "
3028                              "load/store-lanes\n");
3029           goto again;
3030         }
3031     }
3032
3033   /* Dissolve SLP-only groups.  */
3034   vect_dissolve_slp_only_groups (loop_vinfo);
3035
3036   /* Scan all the remaining operations in the loop that are not subject
3037      to SLP and make sure they are vectorizable.  */
3038   ok = vect_analyze_loop_operations (loop_vinfo);
3039   if (!ok)
3040     {
3041       if (dump_enabled_p ())
3042         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3043                          "bad operation or unsupported loop bound.\n");
3044       return ok;
3045     }
3046
3047   /* For now, we don't expect to mix both masking and length approaches for one
3048      loop, disable it if both are recorded.  */
3049   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3050       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3051       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3052     {
3053       if (dump_enabled_p ())
3054         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3055                          "can't vectorize a loop with partial vectors"
3056                          " because we don't expect to mix different"
3057                          " approaches with partial vectors for the"
3058                          " same loop.\n");
3059       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3060     }
3061
3062   /* If we still have the option of using partial vectors,
3063      check whether we can generate the necessary loop controls.  */
3064   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3065     {
3066       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3067         {
3068           if (!vect_verify_full_masking (loop_vinfo)
3069               && !vect_verify_full_masking_avx512 (loop_vinfo))
3070             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3071         }
3072       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3073         if (!vect_verify_loop_lens (loop_vinfo))
3074           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3075     }
3076
3077   /* If we're vectorizing a loop that uses length "controls" and
3078      can iterate more than once, we apply decrementing IV approach
3079      in loop control.  */
3080   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3081       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3082       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3083       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3084            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3085                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3086     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3087
3088   /* If a loop uses length controls and has a decrementing loop control IV,
3089      we will normally pass that IV through a MIN_EXPR to calcaluate the
3090      basis for the length controls.  E.g. in a loop that processes one
3091      element per scalar iteration, the number of elements would be
3092      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3093
3094      This MIN_EXPR approach allows us to use pointer IVs with an invariant
3095      step, since only the final iteration of the vector loop can have
3096      inactive lanes.
3097
3098      However, some targets have a dedicated instruction for calculating the
3099      preferred length, given the total number of elements that still need to
3100      be processed.  This is encapsulated in the SELECT_VL internal function.
3101
3102      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3103      to determine the basis for the length controls.  However, unlike the
3104      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3105      lanes inactive in any iteration of the vector loop, not just the last
3106      iteration.  This SELECT_VL approach therefore requires us to use pointer
3107      IVs with variable steps.
3108
3109      Once we've decided how many elements should be processed by one
3110      iteration of the vector loop, we need to populate the rgroup controls.
3111      If a loop has multiple rgroups, we need to make sure that those rgroups
3112      "line up" (that is, they must be consistent about which elements are
3113      active and which aren't).  This is done by vect_adjust_loop_lens_control.
3114
3115      In principle, it would be possible to use vect_adjust_loop_lens_control
3116      on either the result of a MIN_EXPR or the result of a SELECT_VL.
3117      However:
3118
3119      (1) In practice, it only makes sense to use SELECT_VL when a vector
3120          operation will be controlled directly by the result.  It is not
3121          worth using SELECT_VL if it would only be the input to other
3122          calculations.
3123
3124      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3125          pointer IV will need N updates by a variable amount (N-1 updates
3126          within the iteration and 1 update to move to the next iteration).
3127
3128      Because of this, we prefer to use the MIN_EXPR approach whenever there
3129      is more than one length control.
3130
3131      In addition, SELECT_VL always operates to a granularity of 1 unit.
3132      If we wanted to use it to control an SLP operation on N consecutive
3133      elements, we would need to make the SELECT_VL inputs measure scalar
3134      iterations (rather than elements) and then multiply the SELECT_VL
3135      result by N.  But using SELECT_VL this way is inefficient because
3136      of (1) above.
3137
3138      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3139         satisfied:
3140
3141      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3142      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3143
3144      Since SELECT_VL (variable step) will make SCEV analysis failed and then
3145      we will fail to gain benefits of following unroll optimizations. We prefer
3146      using the MIN_EXPR approach in this situation.  */
3147   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3148     {
3149       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3150       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3151                                           OPTIMIZE_FOR_SPEED)
3152           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3153           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3154           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3155               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3156         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3157     }
3158
3159   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3160      assuming that the loop will be used as a main loop.  We will redo
3161      this analysis later if we instead decide to use the loop as an
3162      epilogue loop.  */
3163   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3164   if (!ok)
3165     return ok;
3166
3167   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3168      to be able to handle fewer than VF scalars, or needs to have a lower VF
3169      than the main loop.  */
3170   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3171       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3172     {
3173       poly_uint64 unscaled_vf
3174         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3175                      orig_loop_vinfo->suggested_unroll_factor);
3176       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3177         return opt_result::failure_at (vect_location,
3178                                        "Vectorization factor too high for"
3179                                        " epilogue loop.\n");
3180     }
3181
3182   /* Check the costings of the loop make vectorizing worthwhile.  */
3183   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3184   if (res < 0)
3185     {
3186       ok = opt_result::failure_at (vect_location,
3187                                    "Loop costings may not be worthwhile.\n");
3188       goto again;
3189     }
3190   if (!res)
3191     return opt_result::failure_at (vect_location,
3192                                    "Loop costings not worthwhile.\n");
3193
3194   /* If an epilogue loop is required make sure we can create one.  */
3195   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3196       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3197       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3198     {
3199       if (dump_enabled_p ())
3200         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3201       if (!vect_can_advance_ivs_p (loop_vinfo)
3202           || !slpeel_can_duplicate_loop_p (loop,
3203                                            LOOP_VINFO_IV_EXIT (loop_vinfo),
3204                                            LOOP_VINFO_IV_EXIT (loop_vinfo)))
3205         {
3206           ok = opt_result::failure_at (vect_location,
3207                                        "not vectorized: can't create required "
3208                                        "epilog loop\n");
3209           goto again;
3210         }
3211     }
3212
3213   /* During peeling, we need to check if number of loop iterations is
3214      enough for both peeled prolog loop and vector loop.  This check
3215      can be merged along with threshold check of loop versioning, so
3216      increase threshold for this case if necessary.
3217
3218      If we are analyzing an epilogue we still want to check what its
3219      versioning threshold would be.  If we decide to vectorize the epilogues we
3220      will want to use the lowest versioning threshold of all epilogues and main
3221      loop.  This will enable us to enter a vectorized epilogue even when
3222      versioning the loop.  We can't simply check whether the epilogue requires
3223      versioning though since we may have skipped some versioning checks when
3224      analyzing the epilogue.  For instance, checks for alias versioning will be
3225      skipped when dealing with epilogues as we assume we already checked them
3226      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3227   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3228     {
3229       poly_uint64 niters_th = 0;
3230       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3231
3232       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3233         {
3234           /* Niters for peeled prolog loop.  */
3235           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3236             {
3237               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3238               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3239               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3240             }
3241           else
3242             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3243         }
3244
3245       /* Niters for at least one iteration of vectorized loop.  */
3246       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3247         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3248       /* One additional iteration because of peeling for gap.  */
3249       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3250         niters_th += 1;
3251
3252       /*  Use the same condition as vect_transform_loop to decide when to use
3253           the cost to determine a versioning threshold.  */
3254       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3255           && ordered_p (th, niters_th))
3256         niters_th = ordered_max (poly_uint64 (th), niters_th);
3257
3258       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3259     }
3260
3261   gcc_assert (known_eq (vectorization_factor,
3262                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3263
3264   slp_done_for_suggested_uf = slp;
3265
3266   /* Ok to vectorize!  */
3267   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3268   return opt_result::success ();
3269
3270 again:
3271   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3272   gcc_assert (!ok);
3273
3274   /* Try again with SLP forced off but if we didn't do any SLP there is
3275      no point in re-trying.  */
3276   if (!slp)
3277     return ok;
3278
3279   /* If the slp decision is true when suggested unroll factor is worked
3280      out, and we are applying suggested unroll factor, we don't need to
3281      re-try any more.  */
3282   if (applying_suggested_uf && slp_done_for_suggested_uf)
3283     return ok;
3284
3285   /* If there are reduction chains re-trying will fail anyway.  */
3286   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3287     return ok;
3288
3289   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3290      via interleaving or lane instructions.  */
3291   slp_instance instance;
3292   slp_tree node;
3293   unsigned i, j;
3294   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3295     {
3296       stmt_vec_info vinfo;
3297       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3298       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3299         continue;
3300       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3301       unsigned int size = DR_GROUP_SIZE (vinfo);
3302       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3303       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3304          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3305          && ! vect_grouped_store_supported (vectype, size))
3306         return opt_result::failure_at (vinfo->stmt,
3307                                        "unsupported grouped store\n");
3308       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3309         {
3310           vinfo = SLP_TREE_REPRESENTATIVE (node);
3311           if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3312             {
3313               vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3314               bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3315               size = DR_GROUP_SIZE (vinfo);
3316               vectype = STMT_VINFO_VECTYPE (vinfo);
3317               if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3318                   && ! vect_grouped_load_supported (vectype, single_element_p,
3319                                                     size))
3320                 return opt_result::failure_at (vinfo->stmt,
3321                                                "unsupported grouped load\n");
3322             }
3323         }
3324     }
3325
3326   if (dump_enabled_p ())
3327     dump_printf_loc (MSG_NOTE, vect_location,
3328                      "re-trying with SLP disabled\n");
3329
3330   /* Roll back state appropriately.  No SLP this time.  */
3331   slp = false;
3332   /* Restore vectorization factor as it were without SLP.  */
3333   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3334   /* Free the SLP instances.  */
3335   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3336     vect_free_slp_instance (instance);
3337   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3338   /* Reset SLP type to loop_vect on all stmts.  */
3339   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3340     {
3341       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3342       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3343            !gsi_end_p (si); gsi_next (&si))
3344         {
3345           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3346           STMT_SLP_TYPE (stmt_info) = loop_vect;
3347           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3348               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3349             {
3350               /* vectorizable_reduction adjusts reduction stmt def-types,
3351                  restore them to that of the PHI.  */
3352               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3353                 = STMT_VINFO_DEF_TYPE (stmt_info);
3354               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3355                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3356                 = STMT_VINFO_DEF_TYPE (stmt_info);
3357             }
3358         }
3359       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3360            !gsi_end_p (si); gsi_next (&si))
3361         {
3362           if (is_gimple_debug (gsi_stmt (si)))
3363             continue;
3364           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3365           STMT_SLP_TYPE (stmt_info) = loop_vect;
3366           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3367             {
3368               stmt_vec_info pattern_stmt_info
3369                 = STMT_VINFO_RELATED_STMT (stmt_info);
3370               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3371                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3372
3373               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3374               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3375               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3376                    !gsi_end_p (pi); gsi_next (&pi))
3377                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3378                   = loop_vect;
3379             }
3380         }
3381     }
3382   /* Free optimized alias test DDRS.  */
3383   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3384   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3385   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3386   /* Reset target cost data.  */
3387   delete loop_vinfo->vector_costs;
3388   loop_vinfo->vector_costs = nullptr;
3389   /* Reset accumulated rgroup information.  */
3390   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3391   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3392   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3393   /* Reset assorted flags.  */
3394   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3395   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3396   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3397   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3398   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3399     = saved_can_use_partial_vectors_p;
3400
3401   goto start_over;
3402 }
3403
3404 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3405    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
3406    OLD_LOOP_VINFO is better unless something specifically indicates
3407    otherwise.
3408
3409    Note that this deliberately isn't a partial order.  */
3410
3411 static bool
3412 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3413                           loop_vec_info old_loop_vinfo)
3414 {
3415   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3416   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3417
3418   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3419   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3420
3421   /* Always prefer a VF of loop->simdlen over any other VF.  */
3422   if (loop->simdlen)
3423     {
3424       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3425       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3426       if (new_simdlen_p != old_simdlen_p)
3427         return new_simdlen_p;
3428     }
3429
3430   const auto *old_costs = old_loop_vinfo->vector_costs;
3431   const auto *new_costs = new_loop_vinfo->vector_costs;
3432   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3433     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3434
3435   return new_costs->better_main_loop_than_p (old_costs);
3436 }
3437
3438 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
3439    true if we should.  */
3440
3441 static bool
3442 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3443                         loop_vec_info old_loop_vinfo)
3444 {
3445   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3446     return false;
3447
3448   if (dump_enabled_p ())
3449     dump_printf_loc (MSG_NOTE, vect_location,
3450                      "***** Preferring vector mode %s to vector mode %s\n",
3451                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3452                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3453   return true;
3454 }
3455
3456 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3457    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3458    MODE_I to the next mode useful to analyze.
3459    Return the loop_vinfo on success and wrapped null on failure.  */
3460
3461 static opt_loop_vec_info
3462 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3463                      const vect_loop_form_info *loop_form_info,
3464                      loop_vec_info main_loop_vinfo,
3465                      const vector_modes &vector_modes, unsigned &mode_i,
3466                      machine_mode &autodetected_vector_mode,
3467                      bool &fatal)
3468 {
3469   loop_vec_info loop_vinfo
3470     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3471
3472   machine_mode vector_mode = vector_modes[mode_i];
3473   loop_vinfo->vector_mode = vector_mode;
3474   unsigned int suggested_unroll_factor = 1;
3475   bool slp_done_for_suggested_uf = false;
3476
3477   /* Run the main analysis.  */
3478   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3479                                         &suggested_unroll_factor,
3480                                         slp_done_for_suggested_uf);
3481   if (dump_enabled_p ())
3482     dump_printf_loc (MSG_NOTE, vect_location,
3483                      "***** Analysis %s with vector mode %s\n",
3484                      res ? "succeeded" : " failed",
3485                      GET_MODE_NAME (loop_vinfo->vector_mode));
3486
3487   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3488     {
3489       if (dump_enabled_p ())
3490         dump_printf_loc (MSG_NOTE, vect_location,
3491                          "***** Re-trying analysis for unrolling"
3492                          " with unroll factor %d and slp %s.\n",
3493                          suggested_unroll_factor,
3494                          slp_done_for_suggested_uf ? "on" : "off");
3495       loop_vec_info unroll_vinfo
3496         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3497       unroll_vinfo->vector_mode = vector_mode;
3498       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3499       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3500                                                 slp_done_for_suggested_uf);
3501       if (new_res)
3502         {
3503           delete loop_vinfo;
3504           loop_vinfo = unroll_vinfo;
3505         }
3506       else
3507         delete unroll_vinfo;
3508     }
3509
3510   /* Remember the autodetected vector mode.  */
3511   if (vector_mode == VOIDmode)
3512     autodetected_vector_mode = loop_vinfo->vector_mode;
3513
3514   /* Advance mode_i, first skipping modes that would result in the
3515      same analysis result.  */
3516   while (mode_i + 1 < vector_modes.length ()
3517          && vect_chooses_same_modes_p (loop_vinfo,
3518                                        vector_modes[mode_i + 1]))
3519     {
3520       if (dump_enabled_p ())
3521         dump_printf_loc (MSG_NOTE, vect_location,
3522                          "***** The result for vector mode %s would"
3523                          " be the same\n",
3524                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3525       mode_i += 1;
3526     }
3527   if (mode_i + 1 < vector_modes.length ()
3528       && VECTOR_MODE_P (autodetected_vector_mode)
3529       && (related_vector_mode (vector_modes[mode_i + 1],
3530                                GET_MODE_INNER (autodetected_vector_mode))
3531           == autodetected_vector_mode)
3532       && (related_vector_mode (autodetected_vector_mode,
3533                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3534           == vector_modes[mode_i + 1]))
3535     {
3536       if (dump_enabled_p ())
3537         dump_printf_loc (MSG_NOTE, vect_location,
3538                          "***** Skipping vector mode %s, which would"
3539                          " repeat the analysis for %s\n",
3540                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3541                          GET_MODE_NAME (autodetected_vector_mode));
3542       mode_i += 1;
3543     }
3544   mode_i++;
3545
3546   if (!res)
3547     {
3548       delete loop_vinfo;
3549       if (fatal)
3550         gcc_checking_assert (main_loop_vinfo == NULL);
3551       return opt_loop_vec_info::propagate_failure (res);
3552     }
3553
3554   return opt_loop_vec_info::success (loop_vinfo);
3555 }
3556
3557 /* Function vect_analyze_loop.
3558
3559    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3560    for it.  The different analyses will record information in the
3561    loop_vec_info struct.  */
3562 opt_loop_vec_info
3563 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3564 {
3565   DUMP_VECT_SCOPE ("analyze_loop_nest");
3566
3567   if (loop_outer (loop)
3568       && loop_vec_info_for_loop (loop_outer (loop))
3569       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3570     return opt_loop_vec_info::failure_at (vect_location,
3571                                           "outer-loop already vectorized.\n");
3572
3573   if (!find_loop_nest (loop, &shared->loop_nest))
3574     return opt_loop_vec_info::failure_at
3575       (vect_location,
3576        "not vectorized: loop nest containing two or more consecutive inner"
3577        " loops cannot be vectorized\n");
3578
3579   /* Analyze the loop form.  */
3580   vect_loop_form_info loop_form_info;
3581   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3582   if (!res)
3583     {
3584       if (dump_enabled_p ())
3585         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3586                          "bad loop form.\n");
3587       return opt_loop_vec_info::propagate_failure (res);
3588     }
3589   if (!integer_onep (loop_form_info.assumptions))
3590     {
3591       /* We consider to vectorize this loop by versioning it under
3592          some assumptions.  In order to do this, we need to clear
3593          existing information computed by scev and niter analyzer.  */
3594       scev_reset_htab ();
3595       free_numbers_of_iterations_estimates (loop);
3596       /* Also set flag for this loop so that following scev and niter
3597          analysis are done under the assumptions.  */
3598       loop_constraint_set (loop, LOOP_C_FINITE);
3599     }
3600   else
3601     /* Clear the existing niter information to make sure the nonwrapping flag
3602        will be calculated and set propriately.  */
3603     free_numbers_of_iterations_estimates (loop);
3604
3605   auto_vector_modes vector_modes;
3606   /* Autodetect first vector size we try.  */
3607   vector_modes.safe_push (VOIDmode);
3608   unsigned int autovec_flags
3609     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3610                                                     loop->simdlen != 0);
3611   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3612                              && !unlimited_cost_model (loop));
3613   machine_mode autodetected_vector_mode = VOIDmode;
3614   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3615   unsigned int mode_i = 0;
3616   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3617
3618   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3619      a mode has not been analyzed.  */
3620   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3621   for (unsigned i = 0; i < vector_modes.length (); ++i)
3622     cached_vf_per_mode.safe_push (0);
3623
3624   /* First determine the main loop vectorization mode, either the first
3625      one that works, starting with auto-detecting the vector mode and then
3626      following the targets order of preference, or the one with the
3627      lowest cost if pick_lowest_cost_p.  */
3628   while (1)
3629     {
3630       bool fatal;
3631       unsigned int last_mode_i = mode_i;
3632       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3633          failed.  */
3634       cached_vf_per_mode[last_mode_i] = -1;
3635       opt_loop_vec_info loop_vinfo
3636         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3637                                NULL, vector_modes, mode_i,
3638                                autodetected_vector_mode, fatal);
3639       if (fatal)
3640         break;
3641
3642       if (loop_vinfo)
3643         {
3644           /*  Analyzis has been successful so update the VF value.  The
3645               VF should always be a multiple of unroll_factor and we want to
3646               capture the original VF here.  */
3647           cached_vf_per_mode[last_mode_i]
3648             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3649                          loop_vinfo->suggested_unroll_factor);
3650           /* Once we hit the desired simdlen for the first time,
3651              discard any previous attempts.  */
3652           if (simdlen
3653               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3654             {
3655               delete first_loop_vinfo;
3656               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3657               simdlen = 0;
3658             }
3659           else if (pick_lowest_cost_p
3660                    && first_loop_vinfo
3661                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3662             {
3663               /* Pick loop_vinfo over first_loop_vinfo.  */
3664               delete first_loop_vinfo;
3665               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3666             }
3667           if (first_loop_vinfo == NULL)
3668             first_loop_vinfo = loop_vinfo;
3669           else
3670             {
3671               delete loop_vinfo;
3672               loop_vinfo = opt_loop_vec_info::success (NULL);
3673             }
3674
3675           /* Commit to first_loop_vinfo if we have no reason to try
3676              alternatives.  */
3677           if (!simdlen && !pick_lowest_cost_p)
3678             break;
3679         }
3680       if (mode_i == vector_modes.length ()
3681           || autodetected_vector_mode == VOIDmode)
3682         break;
3683
3684       /* Try the next biggest vector size.  */
3685       if (dump_enabled_p ())
3686         dump_printf_loc (MSG_NOTE, vect_location,
3687                          "***** Re-trying analysis with vector mode %s\n",
3688                          GET_MODE_NAME (vector_modes[mode_i]));
3689     }
3690   if (!first_loop_vinfo)
3691     return opt_loop_vec_info::propagate_failure (res);
3692
3693   if (dump_enabled_p ())
3694     dump_printf_loc (MSG_NOTE, vect_location,
3695                      "***** Choosing vector mode %s\n",
3696                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3697
3698   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3699      enabled, SIMDUID is not set, it is the innermost loop and we have
3700      either already found the loop's SIMDLEN or there was no SIMDLEN to
3701      begin with.
3702      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3703   bool vect_epilogues = (!simdlen
3704                          && loop->inner == NULL
3705                          && param_vect_epilogues_nomask
3706                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3707                            /* No code motion support for multiple epilogues so for now
3708                               not supported when multiple exits.  */
3709                          && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3710                          && !loop->simduid);
3711   if (!vect_epilogues)
3712     return first_loop_vinfo;
3713
3714   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3715   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3716
3717   /* For epilogues start the analysis from the first mode.  The motivation
3718      behind starting from the beginning comes from cases where the VECTOR_MODES
3719      array may contain length-agnostic and length-specific modes.  Their
3720      ordering is not guaranteed, so we could end up picking a mode for the main
3721      loop that is after the epilogue's optimal mode.  */
3722   vector_modes[0] = autodetected_vector_mode;
3723   mode_i = 0;
3724
3725   bool supports_partial_vectors =
3726     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3727   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3728
3729   while (1)
3730     {
3731       /* If the target does not support partial vectors we can shorten the
3732          number of modes to analyze for the epilogue as we know we can't pick a
3733          mode that would lead to a VF at least as big as the
3734          FIRST_VINFO_VF.  */
3735       if (!supports_partial_vectors
3736           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3737         {
3738           mode_i++;
3739           if (mode_i == vector_modes.length ())
3740             break;
3741           continue;
3742         }
3743
3744       if (dump_enabled_p ())
3745         dump_printf_loc (MSG_NOTE, vect_location,
3746                          "***** Re-trying epilogue analysis with vector "
3747                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3748
3749       bool fatal;
3750       opt_loop_vec_info loop_vinfo
3751         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3752                                first_loop_vinfo,
3753                                vector_modes, mode_i,
3754                                autodetected_vector_mode, fatal);
3755       if (fatal)
3756         break;
3757
3758       if (loop_vinfo)
3759         {
3760           if (pick_lowest_cost_p)
3761             {
3762               /* Keep trying to roll back vectorization attempts while the
3763                  loop_vec_infos they produced were worse than this one.  */
3764               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3765               while (!vinfos.is_empty ()
3766                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3767                 {
3768                   gcc_assert (vect_epilogues);
3769                   delete vinfos.pop ();
3770                 }
3771             }
3772           /* For now only allow one epilogue loop.  */
3773           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3774             {
3775               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3776               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3777               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3778                           || maybe_ne (lowest_th, 0U));
3779               /* Keep track of the known smallest versioning
3780                  threshold.  */
3781               if (ordered_p (lowest_th, th))
3782                 lowest_th = ordered_min (lowest_th, th);
3783             }
3784           else
3785             {
3786               delete loop_vinfo;
3787               loop_vinfo = opt_loop_vec_info::success (NULL);
3788             }
3789
3790           /* For now only allow one epilogue loop, but allow
3791              pick_lowest_cost_p to replace it, so commit to the
3792              first epilogue if we have no reason to try alternatives.  */
3793           if (!pick_lowest_cost_p)
3794             break;
3795         }
3796
3797       if (mode_i == vector_modes.length ())
3798         break;
3799
3800     }
3801
3802   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3803     {
3804       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3805       if (dump_enabled_p ())
3806         dump_printf_loc (MSG_NOTE, vect_location,
3807                          "***** Choosing epilogue vector mode %s\n",
3808                          GET_MODE_NAME
3809                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3810     }
3811
3812   return first_loop_vinfo;
3813 }
3814
3815 /* Return true if there is an in-order reduction function for CODE, storing
3816    it in *REDUC_FN if so.  */
3817
3818 static bool
3819 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3820 {
3821   /* We support MINUS_EXPR by negating the operand.  This also preserves an
3822      initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3823      (-0.0) = -0.0.  */
3824   if (code == PLUS_EXPR || code == MINUS_EXPR)
3825     {
3826       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3827       return true;
3828     }
3829   return false;
3830 }
3831
3832 /* Function reduction_fn_for_scalar_code
3833
3834    Input:
3835    CODE - tree_code of a reduction operations.
3836
3837    Output:
3838    REDUC_FN - the corresponding internal function to be used to reduce the
3839       vector of partial results into a single scalar result, or IFN_LAST
3840       if the operation is a supported reduction operation, but does not have
3841       such an internal function.
3842
3843    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3844
3845 bool
3846 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3847 {
3848   if (code.is_tree_code ())
3849     switch (tree_code (code))
3850       {
3851       case MAX_EXPR:
3852         *reduc_fn = IFN_REDUC_MAX;
3853         return true;
3854
3855       case MIN_EXPR:
3856         *reduc_fn = IFN_REDUC_MIN;
3857         return true;
3858
3859       case PLUS_EXPR:
3860         *reduc_fn = IFN_REDUC_PLUS;
3861         return true;
3862
3863       case BIT_AND_EXPR:
3864         *reduc_fn = IFN_REDUC_AND;
3865         return true;
3866
3867       case BIT_IOR_EXPR:
3868         *reduc_fn = IFN_REDUC_IOR;
3869         return true;
3870
3871       case BIT_XOR_EXPR:
3872         *reduc_fn = IFN_REDUC_XOR;
3873         return true;
3874
3875       case MULT_EXPR:
3876       case MINUS_EXPR:
3877         *reduc_fn = IFN_LAST;
3878         return true;
3879
3880       default:
3881         return false;
3882       }
3883   else
3884     switch (combined_fn (code))
3885       {
3886       CASE_CFN_FMAX:
3887         *reduc_fn = IFN_REDUC_FMAX;
3888         return true;
3889
3890       CASE_CFN_FMIN:
3891         *reduc_fn = IFN_REDUC_FMIN;
3892         return true;
3893
3894       default:
3895         return false;
3896       }
3897 }
3898
3899 /* If there is a neutral value X such that a reduction would not be affected
3900    by the introduction of additional X elements, return that X, otherwise
3901    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3902    of the scalar elements.  If the reduction has just a single initial value
3903    then INITIAL_VALUE is that value, otherwise it is null.
3904    If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3905    In that case no signed zero is returned.  */
3906
3907 tree
3908 neutral_op_for_reduction (tree scalar_type, code_helper code,
3909                           tree initial_value, bool as_initial)
3910 {
3911   if (code.is_tree_code ())
3912     switch (tree_code (code))
3913       {
3914       case DOT_PROD_EXPR:
3915       case SAD_EXPR:
3916       case MINUS_EXPR:
3917       case BIT_IOR_EXPR:
3918       case BIT_XOR_EXPR:
3919         return build_zero_cst (scalar_type);
3920       case WIDEN_SUM_EXPR:
3921       case PLUS_EXPR:
3922         if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3923           return build_real (scalar_type, dconstm0);
3924         else
3925           return build_zero_cst (scalar_type);
3926
3927       case MULT_EXPR:
3928         return build_one_cst (scalar_type);
3929
3930       case BIT_AND_EXPR:
3931         return build_all_ones_cst (scalar_type);
3932
3933       case MAX_EXPR:
3934       case MIN_EXPR:
3935         return initial_value;
3936
3937       default:
3938         return NULL_TREE;
3939       }
3940   else
3941     switch (combined_fn (code))
3942       {
3943       CASE_CFN_FMIN:
3944       CASE_CFN_FMAX:
3945         return initial_value;
3946
3947       default:
3948         return NULL_TREE;
3949       }
3950 }
3951
3952 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3953    STMT is printed with a message MSG. */
3954
3955 static void
3956 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3957 {
3958   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3959 }
3960
3961 /* Return true if we need an in-order reduction for operation CODE
3962    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3963    overflow must wrap.  */
3964
3965 bool
3966 needs_fold_left_reduction_p (tree type, code_helper code)
3967 {
3968   /* CHECKME: check for !flag_finite_math_only too?  */
3969   if (SCALAR_FLOAT_TYPE_P (type))
3970     {
3971       if (code.is_tree_code ())
3972         switch (tree_code (code))
3973           {
3974           case MIN_EXPR:
3975           case MAX_EXPR:
3976             return false;
3977
3978           default:
3979             return !flag_associative_math;
3980           }
3981       else
3982         switch (combined_fn (code))
3983           {
3984           CASE_CFN_FMIN:
3985           CASE_CFN_FMAX:
3986             return false;
3987
3988           default:
3989             return !flag_associative_math;
3990           }
3991     }
3992
3993   if (INTEGRAL_TYPE_P (type))
3994     return (!code.is_tree_code ()
3995             || !operation_no_trapping_overflow (type, tree_code (code)));
3996
3997   if (SAT_FIXED_POINT_TYPE_P (type))
3998     return true;
3999
4000   return false;
4001 }
4002
4003 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
4004    has a handled computation expression.  Store the main reduction
4005    operation in *CODE.  */
4006
4007 static bool
4008 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4009                       tree loop_arg, code_helper *code,
4010                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4011 {
4012   auto_bitmap visited;
4013   tree lookfor = PHI_RESULT (phi);
4014   ssa_op_iter curri;
4015   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4016   while (USE_FROM_PTR (curr) != loop_arg)
4017     curr = op_iter_next_use (&curri);
4018   curri.i = curri.numops;
4019   do
4020     {
4021       path.safe_push (std::make_pair (curri, curr));
4022       tree use = USE_FROM_PTR (curr);
4023       if (use == lookfor)
4024         break;
4025       gimple *def = SSA_NAME_DEF_STMT (use);
4026       if (gimple_nop_p (def)
4027           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4028         {
4029 pop:
4030           do
4031             {
4032               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4033               curri = x.first;
4034               curr = x.second;
4035               do
4036                 curr = op_iter_next_use (&curri);
4037               /* Skip already visited or non-SSA operands (from iterating
4038                  over PHI args).  */
4039               while (curr != NULL_USE_OPERAND_P
4040                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4041                          || ! bitmap_set_bit (visited,
4042                                               SSA_NAME_VERSION
4043                                                 (USE_FROM_PTR (curr)))));
4044             }
4045           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4046           if (curr == NULL_USE_OPERAND_P)
4047             break;
4048         }
4049       else
4050         {
4051           if (gimple_code (def) == GIMPLE_PHI)
4052             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4053           else
4054             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4055           while (curr != NULL_USE_OPERAND_P
4056                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4057                      || ! bitmap_set_bit (visited,
4058                                           SSA_NAME_VERSION
4059                                             (USE_FROM_PTR (curr)))))
4060             curr = op_iter_next_use (&curri);
4061           if (curr == NULL_USE_OPERAND_P)
4062             goto pop;
4063         }
4064     }
4065   while (1);
4066   if (dump_file && (dump_flags & TDF_DETAILS))
4067     {
4068       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4069       unsigned i;
4070       std::pair<ssa_op_iter, use_operand_p> *x;
4071       FOR_EACH_VEC_ELT (path, i, x)
4072         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4073       dump_printf (MSG_NOTE, "\n");
4074     }
4075
4076   /* Check whether the reduction path detected is valid.  */
4077   bool fail = path.length () == 0;
4078   bool neg = false;
4079   int sign = -1;
4080   *code = ERROR_MARK;
4081   for (unsigned i = 1; i < path.length (); ++i)
4082     {
4083       gimple *use_stmt = USE_STMT (path[i].second);
4084       gimple_match_op op;
4085       if (!gimple_extract_op (use_stmt, &op))
4086         {
4087           fail = true;
4088           break;
4089         }
4090       unsigned int opi = op.num_ops;
4091       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4092         {
4093           /* The following make sure we can compute the operand index
4094              easily plus it mostly disallows chaining via COND_EXPR condition
4095              operands.  */
4096           for (opi = 0; opi < op.num_ops; ++opi)
4097             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4098               break;
4099         }
4100       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4101         {
4102           for (opi = 0; opi < op.num_ops; ++opi)
4103             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4104               break;
4105         }
4106       if (opi == op.num_ops)
4107         {
4108           fail = true;
4109           break;
4110         }
4111       op.code = canonicalize_code (op.code, op.type);
4112       if (op.code == MINUS_EXPR)
4113         {
4114           op.code = PLUS_EXPR;
4115           /* Track whether we negate the reduction value each iteration.  */
4116           if (op.ops[1] == op.ops[opi])
4117             neg = ! neg;
4118         }
4119       else if (op.code == IFN_COND_SUB)
4120         {
4121           op.code = IFN_COND_ADD;
4122           /* Track whether we negate the reduction value each iteration.  */
4123           if (op.ops[2] == op.ops[opi])
4124             neg = ! neg;
4125         }
4126       if (CONVERT_EXPR_CODE_P (op.code)
4127           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4128         ;
4129       else if (*code == ERROR_MARK)
4130         {
4131           *code = op.code;
4132           sign = TYPE_SIGN (op.type);
4133         }
4134       else if (op.code != *code)
4135         {
4136           fail = true;
4137           break;
4138         }
4139       else if ((op.code == MIN_EXPR
4140                 || op.code == MAX_EXPR)
4141                && sign != TYPE_SIGN (op.type))
4142         {
4143           fail = true;
4144           break;
4145         }
4146       /* Check there's only a single stmt the op is used on.  For the
4147          not value-changing tail and the last stmt allow out-of-loop uses.
4148          ???  We could relax this and handle arbitrary live stmts by
4149          forcing a scalar epilogue for example.  */
4150       imm_use_iterator imm_iter;
4151       use_operand_p use_p;
4152       gimple *op_use_stmt;
4153       unsigned cnt = 0;
4154       bool cond_fn_p = op.code.is_internal_fn ()
4155         && (conditional_internal_fn_code (internal_fn (op.code))
4156             != ERROR_MARK);
4157
4158       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4159         {
4160         /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4161            op1 twice (once as definition, once as else) in the same operation.
4162            Allow this.  */
4163           if (cond_fn_p && op_use_stmt == use_stmt)
4164             {
4165               gcall *call = as_a<gcall *> (use_stmt);
4166               unsigned else_pos
4167                 = internal_fn_else_index (internal_fn (op.code));
4168
4169               for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4170                 {
4171                   if (j == else_pos)
4172                     continue;
4173                   if (gimple_call_arg (call, j) == op.ops[opi])
4174                     cnt++;
4175                 }
4176             }
4177           else if (!is_gimple_debug (op_use_stmt)
4178                    && (*code != ERROR_MARK
4179                        || flow_bb_inside_loop_p (loop,
4180                                                  gimple_bb (op_use_stmt))))
4181             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4182               cnt++;
4183         }
4184
4185       if (cnt != 1)
4186         {
4187           fail = true;
4188           break;
4189         }
4190     }
4191   return ! fail && ! neg && *code != ERROR_MARK;
4192 }
4193
4194 bool
4195 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4196                       tree loop_arg, enum tree_code code)
4197 {
4198   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4199   code_helper code_;
4200   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4201           && code_ == code);
4202 }
4203
4204
4205
4206 /* Function vect_is_simple_reduction
4207
4208    (1) Detect a cross-iteration def-use cycle that represents a simple
4209    reduction computation.  We look for the following pattern:
4210
4211    loop_header:
4212      a1 = phi < a0, a2 >
4213      a3 = ...
4214      a2 = operation (a3, a1)
4215
4216    or
4217
4218    a3 = ...
4219    loop_header:
4220      a1 = phi < a0, a2 >
4221      a2 = operation (a3, a1)
4222
4223    such that:
4224    1. operation is commutative and associative and it is safe to
4225       change the order of the computation
4226    2. no uses for a2 in the loop (a2 is used out of the loop)
4227    3. no uses of a1 in the loop besides the reduction operation
4228    4. no uses of a1 outside the loop.
4229
4230    Conditions 1,4 are tested here.
4231    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4232
4233    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4234    nested cycles.
4235
4236    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4237    reductions:
4238
4239      a1 = phi < a0, a2 >
4240      inner loop (def of a3)
4241      a2 = phi < a3 >
4242
4243    (4) Detect condition expressions, ie:
4244      for (int i = 0; i < N; i++)
4245        if (a[i] < val)
4246         ret_val = a[i];
4247
4248 */
4249
4250 static stmt_vec_info
4251 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4252                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4253 {
4254   gphi *phi = as_a <gphi *> (phi_info->stmt);
4255   gimple *phi_use_stmt = NULL;
4256   imm_use_iterator imm_iter;
4257   use_operand_p use_p;
4258
4259   *double_reduc = false;
4260   *reduc_chain_p = false;
4261   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4262
4263   tree phi_name = PHI_RESULT (phi);
4264   /* ???  If there are no uses of the PHI result the inner loop reduction
4265      won't be detected as possibly double-reduction by vectorizable_reduction
4266      because that tries to walk the PHI arg from the preheader edge which
4267      can be constant.  See PR60382.  */
4268   if (has_zero_uses (phi_name))
4269     return NULL;
4270   class loop *loop = (gimple_bb (phi))->loop_father;
4271   unsigned nphi_def_loop_uses = 0;
4272   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4273     {
4274       gimple *use_stmt = USE_STMT (use_p);
4275       if (is_gimple_debug (use_stmt))
4276         continue;
4277
4278       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4279         {
4280           if (dump_enabled_p ())
4281             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4282                              "intermediate value used outside loop.\n");
4283
4284           return NULL;
4285         }
4286
4287       /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4288          op1 twice (once as definition, once as else) in the same operation.
4289          Only count it as one. */
4290       if (use_stmt != phi_use_stmt)
4291         {
4292           nphi_def_loop_uses++;
4293           phi_use_stmt = use_stmt;
4294         }
4295     }
4296
4297   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4298   if (TREE_CODE (latch_def) != SSA_NAME)
4299     {
4300       if (dump_enabled_p ())
4301         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4302                          "reduction: not ssa_name: %T\n", latch_def);
4303       return NULL;
4304     }
4305
4306   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4307   if (!def_stmt_info
4308       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4309     return NULL;
4310
4311   bool nested_in_vect_loop
4312     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4313   unsigned nlatch_def_loop_uses = 0;
4314   auto_vec<gphi *, 3> lcphis;
4315   bool inner_loop_of_double_reduc = false;
4316   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4317     {
4318       gimple *use_stmt = USE_STMT (use_p);
4319       if (is_gimple_debug (use_stmt))
4320         continue;
4321       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4322         nlatch_def_loop_uses++;
4323       else
4324         {
4325           /* We can have more than one loop-closed PHI.  */
4326           lcphis.safe_push (as_a <gphi *> (use_stmt));
4327           if (nested_in_vect_loop
4328               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4329                   == vect_double_reduction_def))
4330             inner_loop_of_double_reduc = true;
4331         }
4332     }
4333
4334   /* If we are vectorizing an inner reduction we are executing that
4335      in the original order only in case we are not dealing with a
4336      double reduction.  */
4337   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4338     {
4339       if (dump_enabled_p ())
4340         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4341                         "detected nested cycle: ");
4342       return def_stmt_info;
4343     }
4344
4345   /* When the inner loop of a double reduction ends up with more than
4346      one loop-closed PHI we have failed to classify alternate such
4347      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4348   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4349     {
4350       if (dump_enabled_p ())
4351         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4352                          "unhandle double reduction\n");
4353       return NULL;
4354     }
4355
4356   /* If this isn't a nested cycle or if the nested cycle reduction value
4357      is used ouside of the inner loop we cannot handle uses of the reduction
4358      value.  */
4359   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4360     {
4361       if (dump_enabled_p ())
4362         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4363                          "reduction used in loop.\n");
4364       return NULL;
4365     }
4366
4367   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4368      defined in the inner loop.  */
4369   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4370     {
4371       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4372       if (gimple_phi_num_args (def_stmt) != 1
4373           || TREE_CODE (op1) != SSA_NAME)
4374         {
4375           if (dump_enabled_p ())
4376             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4377                              "unsupported phi node definition.\n");
4378
4379           return NULL;
4380         }
4381
4382       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4383          and the latch definition op1.  */
4384       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4385       if (gimple_bb (def1)
4386           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4387           && loop->inner
4388           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4389           && (is_gimple_assign (def1) || is_gimple_call (def1))
4390           && is_a <gphi *> (phi_use_stmt)
4391           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4392           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4393                                             loop_latch_edge (loop->inner))))
4394         {
4395           if (dump_enabled_p ())
4396             report_vect_op (MSG_NOTE, def_stmt,
4397                             "detected double reduction: ");
4398
4399           *double_reduc = true;
4400           return def_stmt_info;
4401         }
4402
4403       return NULL;
4404     }
4405
4406   /* Look for the expression computing latch_def from then loop PHI result.  */
4407   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4408   code_helper code;
4409   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4410                             path))
4411     {
4412       STMT_VINFO_REDUC_CODE (phi_info) = code;
4413       if (code == COND_EXPR && !nested_in_vect_loop)
4414         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4415
4416       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4417          reduction chain for which the additional restriction is that
4418          all operations in the chain are the same.  */
4419       auto_vec<stmt_vec_info, 8> reduc_chain;
4420       unsigned i;
4421       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4422       for (i = path.length () - 1; i >= 1; --i)
4423         {
4424           gimple *stmt = USE_STMT (path[i].second);
4425           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4426           gimple_match_op op;
4427           if (!gimple_extract_op (stmt, &op))
4428             gcc_unreachable ();
4429           if (gassign *assign = dyn_cast<gassign *> (stmt))
4430             STMT_VINFO_REDUC_IDX (stmt_info)
4431               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4432           else
4433             {
4434               gcall *call = as_a<gcall *> (stmt);
4435               STMT_VINFO_REDUC_IDX (stmt_info)
4436                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4437             }
4438           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4439                                      && (i == 1 || i == path.length () - 1));
4440           if ((op.code != code && !leading_conversion)
4441               /* We can only handle the final value in epilogue
4442                  generation for reduction chains.  */
4443               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4444             is_slp_reduc = false;
4445           /* For reduction chains we support a trailing/leading
4446              conversions.  We do not store those in the actual chain.  */
4447           if (leading_conversion)
4448             continue;
4449           reduc_chain.safe_push (stmt_info);
4450         }
4451       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4452         {
4453           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4454             {
4455               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4456               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4457             }
4458           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4459           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4460
4461           /* Save the chain for further analysis in SLP detection.  */
4462           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4463           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4464
4465           *reduc_chain_p = true;
4466           if (dump_enabled_p ())
4467             dump_printf_loc (MSG_NOTE, vect_location,
4468                             "reduction: detected reduction chain\n");
4469         }
4470       else if (dump_enabled_p ())
4471         dump_printf_loc (MSG_NOTE, vect_location,
4472                          "reduction: detected reduction\n");
4473
4474       return def_stmt_info;
4475     }
4476
4477   if (dump_enabled_p ())
4478     dump_printf_loc (MSG_NOTE, vect_location,
4479                      "reduction: unknown pattern\n");
4480
4481   return NULL;
4482 }
4483
4484 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4485    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4486    or -1 if not known.  */
4487
4488 static int
4489 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4490 {
4491   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4492   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4493     {
4494       if (dump_enabled_p ())
4495         dump_printf_loc (MSG_NOTE, vect_location,
4496                          "cost model: epilogue peel iters set to vf/2 "
4497                          "because loop iterations are unknown .\n");
4498       return assumed_vf / 2;
4499     }
4500   else
4501     {
4502       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4503       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4504       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4505       /* If we need to peel for gaps, but no peeling is required, we have to
4506          peel VF iterations.  */
4507       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4508         peel_iters_epilogue = assumed_vf;
4509       return peel_iters_epilogue;
4510     }
4511 }
4512
4513 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
4514 int
4515 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4516                              int *peel_iters_epilogue,
4517                              stmt_vector_for_cost *scalar_cost_vec,
4518                              stmt_vector_for_cost *prologue_cost_vec,
4519                              stmt_vector_for_cost *epilogue_cost_vec)
4520 {
4521   int retval = 0;
4522
4523   *peel_iters_epilogue
4524     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4525
4526   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4527     {
4528       /* If peeled iterations are known but number of scalar loop
4529          iterations are unknown, count a taken branch per peeled loop.  */
4530       if (peel_iters_prologue > 0)
4531         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4532                                    vect_prologue);
4533       if (*peel_iters_epilogue > 0)
4534         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4535                                     vect_epilogue);
4536     }
4537
4538   stmt_info_for_cost *si;
4539   int j;
4540   if (peel_iters_prologue)
4541     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4542       retval += record_stmt_cost (prologue_cost_vec,
4543                                   si->count * peel_iters_prologue,
4544                                   si->kind, si->stmt_info, si->misalign,
4545                                   vect_prologue);
4546   if (*peel_iters_epilogue)
4547     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4548       retval += record_stmt_cost (epilogue_cost_vec,
4549                                   si->count * *peel_iters_epilogue,
4550                                   si->kind, si->stmt_info, si->misalign,
4551                                   vect_epilogue);
4552
4553   return retval;
4554 }
4555
4556 /* Function vect_estimate_min_profitable_iters
4557
4558    Return the number of iterations required for the vector version of the
4559    loop to be profitable relative to the cost of the scalar version of the
4560    loop.
4561
4562    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4563    of iterations for vectorization.  -1 value means loop vectorization
4564    is not profitable.  This returned value may be used for dynamic
4565    profitability check.
4566
4567    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4568    for static check against estimated number of iterations.  */
4569
4570 static void
4571 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4572                                     int *ret_min_profitable_niters,
4573                                     int *ret_min_profitable_estimate,
4574                                     unsigned *suggested_unroll_factor)
4575 {
4576   int min_profitable_iters;
4577   int min_profitable_estimate;
4578   int peel_iters_prologue;
4579   int peel_iters_epilogue;
4580   unsigned vec_inside_cost = 0;
4581   int vec_outside_cost = 0;
4582   unsigned vec_prologue_cost = 0;
4583   unsigned vec_epilogue_cost = 0;
4584   int scalar_single_iter_cost = 0;
4585   int scalar_outside_cost = 0;
4586   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4587   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4588   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4589
4590   /* Cost model disabled.  */
4591   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4592     {
4593       if (dump_enabled_p ())
4594         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4595       *ret_min_profitable_niters = 0;
4596       *ret_min_profitable_estimate = 0;
4597       return;
4598     }
4599
4600   /* Requires loop versioning tests to handle misalignment.  */
4601   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4602     {
4603       /*  FIXME: Make cost depend on complexity of individual check.  */
4604       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4605       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4606       if (dump_enabled_p ())
4607         dump_printf (MSG_NOTE,
4608                      "cost model: Adding cost of checks for loop "
4609                      "versioning to treat misalignment.\n");
4610     }
4611
4612   /* Requires loop versioning with alias checks.  */
4613   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4614     {
4615       /*  FIXME: Make cost depend on complexity of individual check.  */
4616       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4617       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4618       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4619       if (len)
4620         /* Count LEN - 1 ANDs and LEN comparisons.  */
4621         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4622                               scalar_stmt, vect_prologue);
4623       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4624       if (len)
4625         {
4626           /* Count LEN - 1 ANDs and LEN comparisons.  */
4627           unsigned int nstmts = len * 2 - 1;
4628           /* +1 for each bias that needs adding.  */
4629           for (unsigned int i = 0; i < len; ++i)
4630             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4631               nstmts += 1;
4632           (void) add_stmt_cost (target_cost_data, nstmts,
4633                                 scalar_stmt, vect_prologue);
4634         }
4635       if (dump_enabled_p ())
4636         dump_printf (MSG_NOTE,
4637                      "cost model: Adding cost of checks for loop "
4638                      "versioning aliasing.\n");
4639     }
4640
4641   /* Requires loop versioning with niter checks.  */
4642   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4643     {
4644       /*  FIXME: Make cost depend on complexity of individual check.  */
4645       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4646                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4647       if (dump_enabled_p ())
4648         dump_printf (MSG_NOTE,
4649                      "cost model: Adding cost of checks for loop "
4650                      "versioning niters.\n");
4651     }
4652
4653   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4654     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4655                           vect_prologue);
4656
4657   /* Count statements in scalar loop.  Using this as scalar cost for a single
4658      iteration for now.
4659
4660      TODO: Add outer loop support.
4661
4662      TODO: Consider assigning different costs to different scalar
4663      statements.  */
4664
4665   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4666
4667   /* Add additional cost for the peeled instructions in prologue and epilogue
4668      loop.  (For fully-masked loops there will be no peeling.)
4669
4670      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4671      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4672
4673      TODO: Build an expression that represents peel_iters for prologue and
4674      epilogue to be used in a run-time test.  */
4675
4676   bool prologue_need_br_taken_cost = false;
4677   bool prologue_need_br_not_taken_cost = false;
4678
4679   /* Calculate peel_iters_prologue.  */
4680   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4681     peel_iters_prologue = 0;
4682   else if (npeel < 0)
4683     {
4684       peel_iters_prologue = assumed_vf / 2;
4685       if (dump_enabled_p ())
4686         dump_printf (MSG_NOTE, "cost model: "
4687                      "prologue peel iters set to vf/2.\n");
4688
4689       /* If peeled iterations are unknown, count a taken branch and a not taken
4690          branch per peeled loop.  Even if scalar loop iterations are known,
4691          vector iterations are not known since peeled prologue iterations are
4692          not known.  Hence guards remain the same.  */
4693       prologue_need_br_taken_cost = true;
4694       prologue_need_br_not_taken_cost = true;
4695     }
4696   else
4697     {
4698       peel_iters_prologue = npeel;
4699       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4700         /* If peeled iterations are known but number of scalar loop
4701            iterations are unknown, count a taken branch per peeled loop.  */
4702         prologue_need_br_taken_cost = true;
4703     }
4704
4705   bool epilogue_need_br_taken_cost = false;
4706   bool epilogue_need_br_not_taken_cost = false;
4707
4708   /* Calculate peel_iters_epilogue.  */
4709   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4710     /* We need to peel exactly one iteration for gaps.  */
4711     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4712   else if (npeel < 0)
4713     {
4714       /* If peeling for alignment is unknown, loop bound of main loop
4715          becomes unknown.  */
4716       peel_iters_epilogue = assumed_vf / 2;
4717       if (dump_enabled_p ())
4718         dump_printf (MSG_NOTE, "cost model: "
4719                      "epilogue peel iters set to vf/2 because "
4720                      "peeling for alignment is unknown.\n");
4721
4722       /* See the same reason above in peel_iters_prologue calculation.  */
4723       epilogue_need_br_taken_cost = true;
4724       epilogue_need_br_not_taken_cost = true;
4725     }
4726   else
4727     {
4728       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4729       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4730         /* If peeled iterations are known but number of scalar loop
4731            iterations are unknown, count a taken branch per peeled loop.  */
4732         epilogue_need_br_taken_cost = true;
4733     }
4734
4735   stmt_info_for_cost *si;
4736   int j;
4737   /* Add costs associated with peel_iters_prologue.  */
4738   if (peel_iters_prologue)
4739     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4740       {
4741         (void) add_stmt_cost (target_cost_data,
4742                               si->count * peel_iters_prologue, si->kind,
4743                               si->stmt_info, si->node, si->vectype,
4744                               si->misalign, vect_prologue);
4745       }
4746
4747   /* Add costs associated with peel_iters_epilogue.  */
4748   if (peel_iters_epilogue)
4749     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4750       {
4751         (void) add_stmt_cost (target_cost_data,
4752                               si->count * peel_iters_epilogue, si->kind,
4753                               si->stmt_info, si->node, si->vectype,
4754                               si->misalign, vect_epilogue);
4755       }
4756
4757   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4758
4759   if (prologue_need_br_taken_cost)
4760     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4761                           vect_prologue);
4762
4763   if (prologue_need_br_not_taken_cost)
4764     (void) add_stmt_cost (target_cost_data, 1,
4765                           cond_branch_not_taken, vect_prologue);
4766
4767   if (epilogue_need_br_taken_cost)
4768     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4769                           vect_epilogue);
4770
4771   if (epilogue_need_br_not_taken_cost)
4772     (void) add_stmt_cost (target_cost_data, 1,
4773                           cond_branch_not_taken, vect_epilogue);
4774
4775   /* Take care of special costs for rgroup controls of partial vectors.  */
4776   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4777       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4778           == vect_partial_vectors_avx512))
4779     {
4780       /* Calculate how many masks we need to generate.  */
4781       unsigned int num_masks = 0;
4782       bool need_saturation = false;
4783       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4784         if (rgm.type)
4785           {
4786             unsigned nvectors = rgm.factor;
4787             num_masks += nvectors;
4788             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4789                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4790               need_saturation = true;
4791           }
4792
4793       /* ???  The target isn't able to identify the costs below as
4794          producing masks so it cannot penaltize cases where we'd run
4795          out of mask registers for example.  */
4796
4797       /* ???  We are also failing to account for smaller vector masks
4798          we generate by splitting larger masks in vect_get_loop_mask.  */
4799
4800       /* In the worst case, we need to generate each mask in the prologue
4801          and in the loop body.  We need one splat per group and one
4802          compare per mask.
4803
4804          Sometimes the prologue mask will fold to a constant,
4805          so the actual prologue cost might be smaller.  However, it's
4806          simpler and safer to use the worst-case cost; if this ends up
4807          being the tie-breaker between vectorizing or not, then it's
4808          probably better not to vectorize.  */
4809       (void) add_stmt_cost (target_cost_data,
4810                             num_masks
4811                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4812                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4813                             vect_prologue);
4814       (void) add_stmt_cost (target_cost_data,
4815                             num_masks
4816                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4817                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4818
4819       /* When we need saturation we need it both in the prologue and
4820          the epilogue.  */
4821       if (need_saturation)
4822         {
4823           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4824                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4825           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4826                                 NULL, NULL, NULL_TREE, 0, vect_body);
4827         }
4828     }
4829   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4830            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4831                == vect_partial_vectors_while_ult))
4832     {
4833       /* Calculate how many masks we need to generate.  */
4834       unsigned int num_masks = 0;
4835       rgroup_controls *rgm;
4836       unsigned int num_vectors_m1;
4837       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4838                         num_vectors_m1, rgm)
4839         if (rgm->type)
4840           num_masks += num_vectors_m1 + 1;
4841       gcc_assert (num_masks > 0);
4842
4843       /* In the worst case, we need to generate each mask in the prologue
4844          and in the loop body.  One of the loop body mask instructions
4845          replaces the comparison in the scalar loop, and since we don't
4846          count the scalar comparison against the scalar body, we shouldn't
4847          count that vector instruction against the vector body either.
4848
4849          Sometimes we can use unpacks instead of generating prologue
4850          masks and sometimes the prologue mask will fold to a constant,
4851          so the actual prologue cost might be smaller.  However, it's
4852          simpler and safer to use the worst-case cost; if this ends up
4853          being the tie-breaker between vectorizing or not, then it's
4854          probably better not to vectorize.  */
4855       (void) add_stmt_cost (target_cost_data, num_masks,
4856                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4857                             vect_prologue);
4858       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4859                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4860                             vect_body);
4861     }
4862   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4863     {
4864       /* Referring to the functions vect_set_loop_condition_partial_vectors
4865          and vect_set_loop_controls_directly, we need to generate each
4866          length in the prologue and in the loop body if required. Although
4867          there are some possible optimizations, we consider the worst case
4868          here.  */
4869
4870       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4871       signed char partial_load_store_bias
4872         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4873       bool need_iterate_p
4874         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4875            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4876
4877       /* Calculate how many statements to be added.  */
4878       unsigned int prologue_stmts = 0;
4879       unsigned int body_stmts = 0;
4880
4881       rgroup_controls *rgc;
4882       unsigned int num_vectors_m1;
4883       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4884         if (rgc->type)
4885           {
4886             /* May need one SHIFT for nitems_total computation.  */
4887             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4888             if (nitems != 1 && !niters_known_p)
4889               prologue_stmts += 1;
4890
4891             /* May need one MAX and one MINUS for wrap around.  */
4892             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4893               prologue_stmts += 2;
4894
4895             /* Need one MAX and one MINUS for each batch limit excepting for
4896                the 1st one.  */
4897             prologue_stmts += num_vectors_m1 * 2;
4898
4899             unsigned int num_vectors = num_vectors_m1 + 1;
4900
4901             /* Need to set up lengths in prologue, only one MIN required
4902                for each since start index is zero.  */
4903             prologue_stmts += num_vectors;
4904
4905             /* If we have a non-zero partial load bias, we need one PLUS
4906                to adjust the load length.  */
4907             if (partial_load_store_bias != 0)
4908               body_stmts += 1;
4909
4910             unsigned int length_update_cost = 0;
4911             if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4912               /* For decrement IV style, Each only need a single SELECT_VL
4913                  or MIN since beginning to calculate the number of elements
4914                  need to be processed in current iteration.  */
4915               length_update_cost = 1;
4916             else
4917               /* For increment IV stype, Each may need two MINs and one MINUS to
4918                  update lengths in body for next iteration.  */
4919               length_update_cost = 3;
4920
4921             if (need_iterate_p)
4922               body_stmts += length_update_cost * num_vectors;
4923           }
4924
4925       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4926                             scalar_stmt, vect_prologue);
4927       (void) add_stmt_cost (target_cost_data, body_stmts,
4928                             scalar_stmt, vect_body);
4929     }
4930
4931   /* FORNOW: The scalar outside cost is incremented in one of the
4932      following ways:
4933
4934      1. The vectorizer checks for alignment and aliasing and generates
4935      a condition that allows dynamic vectorization.  A cost model
4936      check is ANDED with the versioning condition.  Hence scalar code
4937      path now has the added cost of the versioning check.
4938
4939        if (cost > th & versioning_check)
4940          jmp to vector code
4941
4942      Hence run-time scalar is incremented by not-taken branch cost.
4943
4944      2. The vectorizer then checks if a prologue is required.  If the
4945      cost model check was not done before during versioning, it has to
4946      be done before the prologue check.
4947
4948        if (cost <= th)
4949          prologue = scalar_iters
4950        if (prologue == 0)
4951          jmp to vector code
4952        else
4953          execute prologue
4954        if (prologue == num_iters)
4955          go to exit
4956
4957      Hence the run-time scalar cost is incremented by a taken branch,
4958      plus a not-taken branch, plus a taken branch cost.
4959
4960      3. The vectorizer then checks if an epilogue is required.  If the
4961      cost model check was not done before during prologue check, it
4962      has to be done with the epilogue check.
4963
4964        if (prologue == 0)
4965          jmp to vector code
4966        else
4967          execute prologue
4968        if (prologue == num_iters)
4969          go to exit
4970        vector code:
4971          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4972            jmp to epilogue
4973
4974      Hence the run-time scalar cost should be incremented by 2 taken
4975      branches.
4976
4977      TODO: The back end may reorder the BBS's differently and reverse
4978      conditions/branch directions.  Change the estimates below to
4979      something more reasonable.  */
4980
4981   /* If the number of iterations is known and we do not do versioning, we can
4982      decide whether to vectorize at compile time.  Hence the scalar version
4983      do not carry cost model guard costs.  */
4984   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4985       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4986     {
4987       /* Cost model check occurs at versioning.  */
4988       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4989         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4990       else
4991         {
4992           /* Cost model check occurs at prologue generation.  */
4993           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4994             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4995               + vect_get_stmt_cost (cond_branch_not_taken);
4996           /* Cost model check occurs at epilogue generation.  */
4997           else
4998             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4999         }
5000     }
5001
5002   /* Complete the target-specific cost calculations.  */
5003   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5004                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5005                suggested_unroll_factor);
5006
5007   if (suggested_unroll_factor && *suggested_unroll_factor > 1
5008       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5009       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5010                     *suggested_unroll_factor,
5011                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5012     {
5013       if (dump_enabled_p ())
5014         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5015                          "can't unroll as unrolled vectorization factor larger"
5016                          " than maximum vectorization factor: "
5017                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5018                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5019       *suggested_unroll_factor = 1;
5020     }
5021
5022   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5023
5024   if (dump_enabled_p ())
5025     {
5026       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5027       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
5028                    vec_inside_cost);
5029       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
5030                    vec_prologue_cost);
5031       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
5032                    vec_epilogue_cost);
5033       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
5034                    scalar_single_iter_cost);
5035       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
5036                    scalar_outside_cost);
5037       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
5038                    vec_outside_cost);
5039       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
5040                    peel_iters_prologue);
5041       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
5042                    peel_iters_epilogue);
5043     }
5044
5045   /* Calculate number of iterations required to make the vector version
5046      profitable, relative to the loop bodies only.  The following condition
5047      must hold true:
5048      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5049      where
5050      SIC = scalar iteration cost, VIC = vector iteration cost,
5051      VOC = vector outside cost, VF = vectorization factor,
5052      NPEEL = prologue iterations + epilogue iterations,
5053      SOC = scalar outside cost for run time cost model check.  */
5054
5055   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5056                           - vec_inside_cost);
5057   if (saving_per_viter <= 0)
5058     {
5059       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5060         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5061                     "vectorization did not happen for a simd loop");
5062
5063       if (dump_enabled_p ())
5064         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5065                          "cost model: the vector iteration cost = %d "
5066                          "divided by the scalar iteration cost = %d "
5067                          "is greater or equal to the vectorization factor = %d"
5068                          ".\n",
5069                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5070       *ret_min_profitable_niters = -1;
5071       *ret_min_profitable_estimate = -1;
5072       return;
5073     }
5074
5075   /* ??? The "if" arm is written to handle all cases; see below for what
5076      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5077   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5078     {
5079       /* Rewriting the condition above in terms of the number of
5080          vector iterations (vniters) rather than the number of
5081          scalar iterations (niters) gives:
5082
5083          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5084
5085          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5086
5087          For integer N, X and Y when X > 0:
5088
5089          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
5090       int outside_overhead = (vec_outside_cost
5091                               - scalar_single_iter_cost * peel_iters_prologue
5092                               - scalar_single_iter_cost * peel_iters_epilogue
5093                               - scalar_outside_cost);
5094       /* We're only interested in cases that require at least one
5095          vector iteration.  */
5096       int min_vec_niters = 1;
5097       if (outside_overhead > 0)
5098         min_vec_niters = outside_overhead / saving_per_viter + 1;
5099
5100       if (dump_enabled_p ())
5101         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
5102                      min_vec_niters);
5103
5104       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5105         {
5106           /* Now that we know the minimum number of vector iterations,
5107              find the minimum niters for which the scalar cost is larger:
5108
5109              SIC * niters > VIC * vniters + VOC - SOC
5110
5111              We know that the minimum niters is no more than
5112              vniters * VF + NPEEL, but it might be (and often is) less
5113              than that if a partial vector iteration is cheaper than the
5114              equivalent scalar code.  */
5115           int threshold = (vec_inside_cost * min_vec_niters
5116                            + vec_outside_cost
5117                            - scalar_outside_cost);
5118           if (threshold <= 0)
5119             min_profitable_iters = 1;
5120           else
5121             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5122         }
5123       else
5124         /* Convert the number of vector iterations into a number of
5125            scalar iterations.  */
5126         min_profitable_iters = (min_vec_niters * assumed_vf
5127                                 + peel_iters_prologue
5128                                 + peel_iters_epilogue);
5129     }
5130   else
5131     {
5132       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5133                               * assumed_vf
5134                               - vec_inside_cost * peel_iters_prologue
5135                               - vec_inside_cost * peel_iters_epilogue);
5136       if (min_profitable_iters <= 0)
5137         min_profitable_iters = 0;
5138       else
5139         {
5140           min_profitable_iters /= saving_per_viter;
5141
5142           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5143               <= (((int) vec_inside_cost * min_profitable_iters)
5144                   + (((int) vec_outside_cost - scalar_outside_cost)
5145                      * assumed_vf)))
5146             min_profitable_iters++;
5147         }
5148     }
5149
5150   if (dump_enabled_p ())
5151     dump_printf (MSG_NOTE,
5152                  "  Calculated minimum iters for profitability: %d\n",
5153                  min_profitable_iters);
5154
5155   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5156       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5157     /* We want the vectorized loop to execute at least once.  */
5158     min_profitable_iters = assumed_vf + peel_iters_prologue;
5159   else if (min_profitable_iters < peel_iters_prologue)
5160     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5161        vectorized loop executes at least once.  */
5162     min_profitable_iters = peel_iters_prologue;
5163
5164   if (dump_enabled_p ())
5165     dump_printf_loc (MSG_NOTE, vect_location,
5166                      "  Runtime profitability threshold = %d\n",
5167                      min_profitable_iters);
5168
5169   *ret_min_profitable_niters = min_profitable_iters;
5170
5171   /* Calculate number of iterations required to make the vector version
5172      profitable, relative to the loop bodies only.
5173
5174      Non-vectorized variant is SIC * niters and it must win over vector
5175      variant on the expected loop trip count.  The following condition must hold true:
5176      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
5177
5178   if (vec_outside_cost <= 0)
5179     min_profitable_estimate = 0;
5180   /* ??? This "else if" arm is written to handle all cases; see below for
5181      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5182   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5183     {
5184       /* This is a repeat of the code above, but with + SOC rather
5185          than - SOC.  */
5186       int outside_overhead = (vec_outside_cost
5187                               - scalar_single_iter_cost * peel_iters_prologue
5188                               - scalar_single_iter_cost * peel_iters_epilogue
5189                               + scalar_outside_cost);
5190       int min_vec_niters = 1;
5191       if (outside_overhead > 0)
5192         min_vec_niters = outside_overhead / saving_per_viter + 1;
5193
5194       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5195         {
5196           int threshold = (vec_inside_cost * min_vec_niters
5197                            + vec_outside_cost
5198                            + scalar_outside_cost);
5199           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5200         }
5201       else
5202         min_profitable_estimate = (min_vec_niters * assumed_vf
5203                                    + peel_iters_prologue
5204                                    + peel_iters_epilogue);
5205     }
5206   else
5207     {
5208       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5209                                  * assumed_vf
5210                                  - vec_inside_cost * peel_iters_prologue
5211                                  - vec_inside_cost * peel_iters_epilogue)
5212                                  / ((scalar_single_iter_cost * assumed_vf)
5213                                    - vec_inside_cost);
5214     }
5215   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5216   if (dump_enabled_p ())
5217     dump_printf_loc (MSG_NOTE, vect_location,
5218                      "  Static estimate profitability threshold = %d\n",
5219                      min_profitable_estimate);
5220
5221   *ret_min_profitable_estimate = min_profitable_estimate;
5222 }
5223
5224 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5225    vector elements (not bits) for a vector with NELT elements.  */
5226 static void
5227 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5228                               vec_perm_builder *sel)
5229 {
5230   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5231      by vec_perm_indices.  */
5232   sel->new_vector (nelt, 1, 3);
5233   for (unsigned int i = 0; i < 3; i++)
5234     sel->quick_push (i + offset);
5235 }
5236
5237 /* Checks whether the target supports whole-vector shifts for vectors of mode
5238    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5239    it supports vec_perm_const with masks for all necessary shift amounts.  */
5240 static bool
5241 have_whole_vector_shift (machine_mode mode)
5242 {
5243   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5244     return true;
5245
5246   /* Variable-length vectors should be handled via the optab.  */
5247   unsigned int nelt;
5248   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5249     return false;
5250
5251   vec_perm_builder sel;
5252   vec_perm_indices indices;
5253   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5254     {
5255       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5256       indices.new_vector (sel, 2, nelt);
5257       if (!can_vec_perm_const_p (mode, mode, indices, false))
5258         return false;
5259     }
5260   return true;
5261 }
5262
5263 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5264    multiplication operands have differing signs and (b) we intend
5265    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5266    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5267
5268 static bool
5269 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5270                                  stmt_vec_info stmt_info)
5271 {
5272   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5273   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5274     return false;
5275
5276   tree rhs1 = gimple_assign_rhs1 (assign);
5277   tree rhs2 = gimple_assign_rhs2 (assign);
5278   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5279     return false;
5280
5281   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5282   gcc_assert (reduc_info->is_reduc_info);
5283   return !directly_supported_p (DOT_PROD_EXPR,
5284                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5285                                 optab_vector_mixed_sign);
5286 }
5287
5288 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5289    functions. Design better to avoid maintenance issues.  */
5290
5291 /* Function vect_model_reduction_cost.
5292
5293    Models cost for a reduction operation, including the vector ops
5294    generated within the strip-mine loop in some cases, the initial
5295    definition before the loop, and the epilogue code that must be generated.  */
5296
5297 static void
5298 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5299                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5300                            vect_reduction_type reduction_type,
5301                            int ncopies, stmt_vector_for_cost *cost_vec)
5302 {
5303   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5304   tree vectype;
5305   machine_mode mode;
5306   class loop *loop = NULL;
5307
5308   if (loop_vinfo)
5309     loop = LOOP_VINFO_LOOP (loop_vinfo);
5310
5311   /* Condition reductions generate two reductions in the loop.  */
5312   if (reduction_type == COND_REDUCTION)
5313     ncopies *= 2;
5314
5315   vectype = STMT_VINFO_VECTYPE (stmt_info);
5316   mode = TYPE_MODE (vectype);
5317   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5318
5319   gimple_match_op op;
5320   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5321     gcc_unreachable ();
5322
5323   bool emulated_mixed_dot_prod
5324     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5325   if (reduction_type == EXTRACT_LAST_REDUCTION)
5326     /* No extra instructions are needed in the prologue.  The loop body
5327        operations are costed in vectorizable_condition.  */
5328     inside_cost = 0;
5329   else if (reduction_type == FOLD_LEFT_REDUCTION)
5330     {
5331       /* No extra instructions needed in the prologue.  */
5332       prologue_cost = 0;
5333
5334       if (reduc_fn != IFN_LAST)
5335         /* Count one reduction-like operation per vector.  */
5336         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5337                                         stmt_info, 0, vect_body);
5338       else
5339         {
5340           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5341           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5342           inside_cost = record_stmt_cost (cost_vec, nelements,
5343                                           vec_to_scalar, stmt_info, 0,
5344                                           vect_body);
5345           inside_cost += record_stmt_cost (cost_vec, nelements,
5346                                            scalar_stmt, stmt_info, 0,
5347                                            vect_body);
5348         }
5349     }
5350   else
5351     {
5352       /* Add in the cost of the initial definitions.  */
5353       int prologue_stmts;
5354       if (reduction_type == COND_REDUCTION)
5355         /* For cond reductions we have four vectors: initial index, step,
5356            initial result of the data reduction, initial value of the index
5357            reduction.  */
5358         prologue_stmts = 4;
5359       else if (emulated_mixed_dot_prod)
5360         /* We need the initial reduction value and two invariants:
5361            one that contains the minimum signed value and one that
5362            contains half of its negative.  */
5363         prologue_stmts = 3;
5364       else
5365         prologue_stmts = 1;
5366       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5367                                          scalar_to_vec, stmt_info, 0,
5368                                          vect_prologue);
5369     }
5370
5371   /* Determine cost of epilogue code.
5372
5373      We have a reduction operator that will reduce the vector in one statement.
5374      Also requires scalar extract.  */
5375
5376   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5377     {
5378       if (reduc_fn != IFN_LAST)
5379         {
5380           if (reduction_type == COND_REDUCTION)
5381             {
5382               /* An EQ stmt and an COND_EXPR stmt.  */
5383               epilogue_cost += record_stmt_cost (cost_vec, 2,
5384                                                  vector_stmt, stmt_info, 0,
5385                                                  vect_epilogue);
5386               /* Reduction of the max index and a reduction of the found
5387                  values.  */
5388               epilogue_cost += record_stmt_cost (cost_vec, 2,
5389                                                  vec_to_scalar, stmt_info, 0,
5390                                                  vect_epilogue);
5391               /* A broadcast of the max value.  */
5392               epilogue_cost += record_stmt_cost (cost_vec, 1,
5393                                                  scalar_to_vec, stmt_info, 0,
5394                                                  vect_epilogue);
5395             }
5396           else
5397             {
5398               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5399                                                  stmt_info, 0, vect_epilogue);
5400               epilogue_cost += record_stmt_cost (cost_vec, 1,
5401                                                  vec_to_scalar, stmt_info, 0,
5402                                                  vect_epilogue);
5403             }
5404         }
5405       else if (reduction_type == COND_REDUCTION)
5406         {
5407           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5408           /* Extraction of scalar elements.  */
5409           epilogue_cost += record_stmt_cost (cost_vec,
5410                                              2 * estimated_nunits,
5411                                              vec_to_scalar, stmt_info, 0,
5412                                              vect_epilogue);
5413           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5414           epilogue_cost += record_stmt_cost (cost_vec,
5415                                              2 * estimated_nunits - 3,
5416                                              scalar_stmt, stmt_info, 0,
5417                                              vect_epilogue);
5418         }
5419       else if (reduction_type == EXTRACT_LAST_REDUCTION
5420                || reduction_type == FOLD_LEFT_REDUCTION)
5421         /* No extra instructions need in the epilogue.  */
5422         ;
5423       else
5424         {
5425           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5426           tree bitsize = TYPE_SIZE (op.type);
5427           int element_bitsize = tree_to_uhwi (bitsize);
5428           int nelements = vec_size_in_bits / element_bitsize;
5429
5430           if (op.code == COND_EXPR)
5431             op.code = MAX_EXPR;
5432
5433           /* We have a whole vector shift available.  */
5434           if (VECTOR_MODE_P (mode)
5435               && directly_supported_p (op.code, vectype)
5436               && have_whole_vector_shift (mode))
5437             {
5438               /* Final reduction via vector shifts and the reduction operator.
5439                  Also requires scalar extract.  */
5440               epilogue_cost += record_stmt_cost (cost_vec,
5441                                                  exact_log2 (nelements) * 2,
5442                                                  vector_stmt, stmt_info, 0,
5443                                                  vect_epilogue);
5444               epilogue_cost += record_stmt_cost (cost_vec, 1,
5445                                                  vec_to_scalar, stmt_info, 0,
5446                                                  vect_epilogue);
5447             }
5448           else
5449             /* Use extracts and reduction op for final reduction.  For N
5450                elements, we have N extracts and N-1 reduction ops.  */
5451             epilogue_cost += record_stmt_cost (cost_vec,
5452                                                nelements + nelements - 1,
5453                                                vector_stmt, stmt_info, 0,
5454                                                vect_epilogue);
5455         }
5456     }
5457
5458   if (dump_enabled_p ())
5459     dump_printf (MSG_NOTE,
5460                  "vect_model_reduction_cost: inside_cost = %d, "
5461                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5462                  prologue_cost, epilogue_cost);
5463 }
5464
5465 /* SEQ is a sequence of instructions that initialize the reduction
5466    described by REDUC_INFO.  Emit them in the appropriate place.  */
5467
5468 static void
5469 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5470                                 stmt_vec_info reduc_info, gimple *seq)
5471 {
5472   if (reduc_info->reused_accumulator)
5473     {
5474       /* When reusing an accumulator from the main loop, we only need
5475          initialization instructions if the main loop can be skipped.
5476          In that case, emit the initialization instructions at the end
5477          of the guard block that does the skip.  */
5478       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5479       gcc_assert (skip_edge);
5480       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5481       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5482     }
5483   else
5484     {
5485       /* The normal case: emit the initialization instructions on the
5486          preheader edge.  */
5487       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5488       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5489     }
5490 }
5491
5492 /* Function get_initial_def_for_reduction
5493
5494    Input:
5495    REDUC_INFO - the info_for_reduction
5496    INIT_VAL - the initial value of the reduction variable
5497    NEUTRAL_OP - a value that has no effect on the reduction, as per
5498                 neutral_op_for_reduction
5499
5500    Output:
5501    Return a vector variable, initialized according to the operation that
5502         STMT_VINFO performs. This vector will be used as the initial value
5503         of the vector of partial results.
5504
5505    The value we need is a vector in which element 0 has value INIT_VAL
5506    and every other element has value NEUTRAL_OP.  */
5507
5508 static tree
5509 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5510                                stmt_vec_info reduc_info,
5511                                tree init_val, tree neutral_op)
5512 {
5513   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5514   tree scalar_type = TREE_TYPE (init_val);
5515   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5516   tree init_def;
5517   gimple_seq stmts = NULL;
5518
5519   gcc_assert (vectype);
5520
5521   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5522               || SCALAR_FLOAT_TYPE_P (scalar_type));
5523
5524   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5525               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5526
5527   if (operand_equal_p (init_val, neutral_op))
5528     {
5529       /* If both elements are equal then the vector described above is
5530          just a splat.  */
5531       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5532       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5533     }
5534   else
5535     {
5536       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5537       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5538       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5539         {
5540           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5541              element 0.  */
5542           init_def = gimple_build_vector_from_val (&stmts, vectype,
5543                                                    neutral_op);
5544           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5545                                    vectype, init_def, init_val);
5546         }
5547       else
5548         {
5549           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5550           tree_vector_builder elts (vectype, 1, 2);
5551           elts.quick_push (init_val);
5552           elts.quick_push (neutral_op);
5553           init_def = gimple_build_vector (&stmts, &elts);
5554         }
5555     }
5556
5557   if (stmts)
5558     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5559   return init_def;
5560 }
5561
5562 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5563    which performs a reduction involving GROUP_SIZE scalar statements.
5564    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5565    is nonnull, introducing extra elements of that value will not change the
5566    result.  */
5567
5568 static void
5569 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5570                                 stmt_vec_info reduc_info,
5571                                 vec<tree> *vec_oprnds,
5572                                 unsigned int number_of_vectors,
5573                                 unsigned int group_size, tree neutral_op)
5574 {
5575   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5576   unsigned HOST_WIDE_INT nunits;
5577   unsigned j, number_of_places_left_in_vector;
5578   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5579   unsigned int i;
5580
5581   gcc_assert (group_size == initial_values.length () || neutral_op);
5582
5583   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5584      created vectors. It is greater than 1 if unrolling is performed.
5585
5586      For example, we have two scalar operands, s1 and s2 (e.g., group of
5587      strided accesses of size two), while NUNITS is four (i.e., four scalars
5588      of this type can be packed in a vector).  The output vector will contain
5589      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5590      will be 2).
5591
5592      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5593      vectors containing the operands.
5594
5595      For example, NUNITS is four as before, and the group size is 8
5596      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5597      {s5, s6, s7, s8}.  */
5598
5599   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5600     nunits = group_size;
5601
5602   number_of_places_left_in_vector = nunits;
5603   bool constant_p = true;
5604   tree_vector_builder elts (vector_type, nunits, 1);
5605   elts.quick_grow (nunits);
5606   gimple_seq ctor_seq = NULL;
5607   for (j = 0; j < nunits * number_of_vectors; ++j)
5608     {
5609       tree op;
5610       i = j % group_size;
5611
5612       /* Get the def before the loop.  In reduction chain we have only
5613          one initial value.  Else we have as many as PHIs in the group.  */
5614       if (i >= initial_values.length () || (j > i && neutral_op))
5615         op = neutral_op;
5616       else
5617         op = initial_values[i];
5618
5619       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5620       number_of_places_left_in_vector--;
5621       elts[nunits - number_of_places_left_in_vector - 1] = op;
5622       if (!CONSTANT_CLASS_P (op))
5623         constant_p = false;
5624
5625       if (number_of_places_left_in_vector == 0)
5626         {
5627           tree init;
5628           if (constant_p && !neutral_op
5629               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5630               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5631             /* Build the vector directly from ELTS.  */
5632             init = gimple_build_vector (&ctor_seq, &elts);
5633           else if (neutral_op)
5634             {
5635               /* Build a vector of the neutral value and shift the
5636                  other elements into place.  */
5637               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5638                                                    neutral_op);
5639               int k = nunits;
5640               while (k > 0 && elts[k - 1] == neutral_op)
5641                 k -= 1;
5642               while (k > 0)
5643                 {
5644                   k -= 1;
5645                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5646                                        vector_type, init, elts[k]);
5647                 }
5648             }
5649           else
5650             {
5651               /* First time round, duplicate ELTS to fill the
5652                  required number of vectors.  */
5653               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5654                                         elts, number_of_vectors, *vec_oprnds);
5655               break;
5656             }
5657           vec_oprnds->quick_push (init);
5658
5659           number_of_places_left_in_vector = nunits;
5660           elts.new_vector (vector_type, nunits, 1);
5661           elts.quick_grow (nunits);
5662           constant_p = true;
5663         }
5664     }
5665   if (ctor_seq != NULL)
5666     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5667 }
5668
5669 /* For a statement STMT_INFO taking part in a reduction operation return
5670    the stmt_vec_info the meta information is stored on.  */
5671
5672 stmt_vec_info
5673 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5674 {
5675   stmt_info = vect_orig_stmt (stmt_info);
5676   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5677   if (!is_a <gphi *> (stmt_info->stmt)
5678       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5679     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5680   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5681   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5682     {
5683       if (gimple_phi_num_args (phi) == 1)
5684         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5685     }
5686   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5687     {
5688       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5689       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5690         stmt_info = info;
5691     }
5692   return stmt_info;
5693 }
5694
5695 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5696    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5697    return false.  */
5698
5699 static bool
5700 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5701                                 stmt_vec_info reduc_info)
5702 {
5703   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5704   if (!main_loop_vinfo)
5705     return false;
5706
5707   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5708     return false;
5709
5710   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5711   auto_vec<tree, 16> main_loop_results (num_phis);
5712   auto_vec<tree, 16> initial_values (num_phis);
5713   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5714     {
5715       /* The epilogue loop can be entered either from the main loop or
5716          from an earlier guard block.  */
5717       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5718       for (tree incoming_value : reduc_info->reduc_initial_values)
5719         {
5720           /* Look for:
5721
5722                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5723                                     INITIAL_VALUE(guard block)>.  */
5724           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5725
5726           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5727           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5728
5729           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5730           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5731
5732           main_loop_results.quick_push (from_main_loop);
5733           initial_values.quick_push (from_skip);
5734         }
5735     }
5736   else
5737     /* The main loop dominates the epilogue loop.  */
5738     main_loop_results.splice (reduc_info->reduc_initial_values);
5739
5740   /* See if the main loop has the kind of accumulator we need.  */
5741   vect_reusable_accumulator *accumulator
5742     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5743   if (!accumulator
5744       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5745       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5746                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5747     return false;
5748
5749   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5750   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5751   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5752   unsigned HOST_WIDE_INT m;
5753   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5754                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5755     return false;
5756   /* Check the intermediate vector types and operations are available.  */
5757   tree prev_vectype = old_vectype;
5758   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5759   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5760     {
5761       intermediate_nunits = exact_div (intermediate_nunits, 2);
5762       tree intermediate_vectype = get_related_vectype_for_scalar_type
5763         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5764       if (!intermediate_vectype
5765           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5766                                     intermediate_vectype)
5767           || !can_vec_extract (TYPE_MODE (prev_vectype),
5768                                TYPE_MODE (intermediate_vectype)))
5769         return false;
5770       prev_vectype = intermediate_vectype;
5771     }
5772
5773   /* Non-SLP reductions might apply an adjustment after the reduction
5774      operation, in order to simplify the initialization of the accumulator.
5775      If the epilogue loop carries on from where the main loop left off,
5776      it should apply the same adjustment to the final reduction result.
5777
5778      If the epilogue loop can also be entered directly (rather than via
5779      the main loop), we need to be able to handle that case in the same way,
5780      with the same adjustment.  (In principle we could add a PHI node
5781      to select the correct adjustment, but in practice that shouldn't be
5782      necessary.)  */
5783   tree main_adjustment
5784     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5785   if (loop_vinfo->main_loop_edge && main_adjustment)
5786     {
5787       gcc_assert (num_phis == 1);
5788       tree initial_value = initial_values[0];
5789       /* Check that we can use INITIAL_VALUE as the adjustment and
5790          initialize the accumulator with a neutral value instead.  */
5791       if (!operand_equal_p (initial_value, main_adjustment))
5792         return false;
5793       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5794       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5795                                                     code, initial_value);
5796     }
5797   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5798   reduc_info->reduc_initial_values.truncate (0);
5799   reduc_info->reduc_initial_values.splice (initial_values);
5800   reduc_info->reused_accumulator = accumulator;
5801   return true;
5802 }
5803
5804 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5805    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5806
5807 static tree
5808 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5809                             gimple_seq *seq)
5810 {
5811   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5812   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5813   tree stype = TREE_TYPE (vectype);
5814   tree new_temp = vec_def;
5815   while (nunits > nunits1)
5816     {
5817       nunits /= 2;
5818       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5819                                                            stype, nunits);
5820       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5821
5822       /* The target has to make sure we support lowpart/highpart
5823          extraction, either via direct vector extract or through
5824          an integer mode punning.  */
5825       tree dst1, dst2;
5826       gimple *epilog_stmt;
5827       if (convert_optab_handler (vec_extract_optab,
5828                                  TYPE_MODE (TREE_TYPE (new_temp)),
5829                                  TYPE_MODE (vectype1))
5830           != CODE_FOR_nothing)
5831         {
5832           /* Extract sub-vectors directly once vec_extract becomes
5833              a conversion optab.  */
5834           dst1 = make_ssa_name (vectype1);
5835           epilog_stmt
5836               = gimple_build_assign (dst1, BIT_FIELD_REF,
5837                                      build3 (BIT_FIELD_REF, vectype1,
5838                                              new_temp, TYPE_SIZE (vectype1),
5839                                              bitsize_int (0)));
5840           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5841           dst2 =  make_ssa_name (vectype1);
5842           epilog_stmt
5843               = gimple_build_assign (dst2, BIT_FIELD_REF,
5844                                      build3 (BIT_FIELD_REF, vectype1,
5845                                              new_temp, TYPE_SIZE (vectype1),
5846                                              bitsize_int (bitsize)));
5847           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5848         }
5849       else
5850         {
5851           /* Extract via punning to appropriately sized integer mode
5852              vector.  */
5853           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5854           tree etype = build_vector_type (eltype, 2);
5855           gcc_assert (convert_optab_handler (vec_extract_optab,
5856                                              TYPE_MODE (etype),
5857                                              TYPE_MODE (eltype))
5858                       != CODE_FOR_nothing);
5859           tree tem = make_ssa_name (etype);
5860           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5861                                              build1 (VIEW_CONVERT_EXPR,
5862                                                      etype, new_temp));
5863           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5864           new_temp = tem;
5865           tem = make_ssa_name (eltype);
5866           epilog_stmt
5867               = gimple_build_assign (tem, BIT_FIELD_REF,
5868                                      build3 (BIT_FIELD_REF, eltype,
5869                                              new_temp, TYPE_SIZE (eltype),
5870                                              bitsize_int (0)));
5871           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5872           dst1 = make_ssa_name (vectype1);
5873           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5874                                              build1 (VIEW_CONVERT_EXPR,
5875                                                      vectype1, tem));
5876           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5877           tem = make_ssa_name (eltype);
5878           epilog_stmt
5879               = gimple_build_assign (tem, BIT_FIELD_REF,
5880                                      build3 (BIT_FIELD_REF, eltype,
5881                                              new_temp, TYPE_SIZE (eltype),
5882                                              bitsize_int (bitsize)));
5883           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5884           dst2 =  make_ssa_name (vectype1);
5885           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5886                                              build1 (VIEW_CONVERT_EXPR,
5887                                                      vectype1, tem));
5888           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5889         }
5890
5891       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5892     }
5893
5894   return new_temp;
5895 }
5896
5897 /* Retrieves the definining statement to be used for a reduction.
5898    For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5899    the reduction definitions.  */
5900
5901 tree
5902 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5903                    slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5904                    vec <gimple *> &vec_stmts)
5905 {
5906   tree def;
5907
5908   if (slp_node)
5909     {
5910       if (!main_exit_p)
5911         slp_node = slp_node_instance->reduc_phis;
5912       def = vect_get_slp_vect_def (slp_node, i);
5913     }
5914   else
5915     {
5916       if (!main_exit_p)
5917         reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5918       vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5919       def = gimple_get_lhs (vec_stmts[0]);
5920     }
5921
5922   return def;
5923 }
5924
5925 /* Function vect_create_epilog_for_reduction
5926
5927    Create code at the loop-epilog to finalize the result of a reduction
5928    computation.
5929
5930    STMT_INFO is the scalar reduction stmt that is being vectorized.
5931    SLP_NODE is an SLP node containing a group of reduction statements. The
5932      first one in this group is STMT_INFO.
5933    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5934    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5935      (counting from 0)
5936    LOOP_EXIT is the edge to update in the merge block.  In the case of a single
5937      exit this edge is always the main loop exit.
5938
5939    This function:
5940    1. Completes the reduction def-use cycles.
5941    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5942       by calling the function specified by REDUC_FN if available, or by
5943       other means (whole-vector shifts or a scalar loop).
5944       The function also creates a new phi node at the loop exit to preserve
5945       loop-closed form, as illustrated below.
5946
5947      The flow at the entry to this function:
5948
5949         loop:
5950           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5951           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5952           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5953         loop_exit:
5954           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5955           use <s_out0>
5956           use <s_out0>
5957
5958      The above is transformed by this function into:
5959
5960         loop:
5961           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5962           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5963           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5964         loop_exit:
5965           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5966           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5967           v_out2 = reduce <v_out1>
5968           s_out3 = extract_field <v_out2, 0>
5969           s_out4 = adjust_result <s_out3>
5970           use <s_out4>
5971           use <s_out4>
5972 */
5973
5974 static void
5975 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5976                                   stmt_vec_info stmt_info,
5977                                   slp_tree slp_node,
5978                                   slp_instance slp_node_instance,
5979                                   edge loop_exit)
5980 {
5981   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5982   gcc_assert (reduc_info->is_reduc_info);
5983   /* For double reductions we need to get at the inner loop reduction
5984      stmt which has the meta info attached.  Our stmt_info is that of the
5985      loop-closed PHI of the inner loop which we remember as
5986      def for the reduction PHI generation.  */
5987   bool double_reduc = false;
5988   bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5989   stmt_vec_info rdef_info = stmt_info;
5990   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5991     {
5992       gcc_assert (!slp_node);
5993       double_reduc = true;
5994       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5995                                             (stmt_info->stmt, 0));
5996       stmt_info = vect_stmt_to_vectorize (stmt_info);
5997     }
5998   gphi *reduc_def_stmt
5999     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
6000   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
6001   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6002   tree vectype;
6003   machine_mode mode;
6004   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6005   basic_block exit_bb;
6006   tree scalar_dest;
6007   tree scalar_type;
6008   gimple *new_phi = NULL, *phi = NULL;
6009   gimple_stmt_iterator exit_gsi;
6010   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6011   gimple *epilog_stmt = NULL;
6012   gimple *exit_phi;
6013   tree bitsize;
6014   tree def;
6015   tree orig_name, scalar_result;
6016   imm_use_iterator imm_iter, phi_imm_iter;
6017   use_operand_p use_p, phi_use_p;
6018   gimple *use_stmt;
6019   auto_vec<tree> reduc_inputs;
6020   int j, i;
6021   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6022   unsigned int group_size = 1, k;
6023   auto_vec<gimple *> phis;
6024   /* SLP reduction without reduction chain, e.g.,
6025      # a1 = phi <a2, a0>
6026      # b1 = phi <b2, b0>
6027      a2 = operation (a1)
6028      b2 = operation (b1)  */
6029   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6030   bool direct_slp_reduc;
6031   tree induction_index = NULL_TREE;
6032
6033   if (slp_node)
6034     group_size = SLP_TREE_LANES (slp_node);
6035
6036   if (nested_in_vect_loop_p (loop, stmt_info))
6037     {
6038       outer_loop = loop;
6039       loop = loop->inner;
6040       gcc_assert (!slp_node && double_reduc);
6041     }
6042
6043   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6044   gcc_assert (vectype);
6045   mode = TYPE_MODE (vectype);
6046
6047   tree induc_val = NULL_TREE;
6048   tree adjustment_def = NULL;
6049   if (slp_node)
6050     ;
6051   else
6052     {
6053       /* Optimize: for induction condition reduction, if we can't use zero
6054          for induc_val, use initial_def.  */
6055       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6056         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6057       else if (double_reduc)
6058         ;
6059       else
6060         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6061     }
6062
6063   stmt_vec_info single_live_out_stmt[] = { stmt_info };
6064   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6065   if (slp_reduc)
6066     /* All statements produce live-out values.  */
6067     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6068   else if (slp_node)
6069     {
6070       /* The last statement in the reduction chain produces the live-out
6071          value.  Note SLP optimization can shuffle scalar stmts to
6072          optimize permutations so we have to search for the last stmt.  */
6073       for (k = 0; k < group_size; ++k)
6074         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6075           {
6076             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6077             break;
6078           }
6079     }
6080
6081   unsigned vec_num;
6082   int ncopies;
6083   if (slp_node)
6084     {
6085       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6086       ncopies = 1;
6087     }
6088   else
6089     {
6090       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6091       vec_num = 1;
6092       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6093     }
6094
6095   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6096      which is updated with the current index of the loop for every match of
6097      the original loop's cond_expr (VEC_STMT).  This results in a vector
6098      containing the last time the condition passed for that vector lane.
6099      The first match will be a 1 to allow 0 to be used for non-matching
6100      indexes.  If there are no matches at all then the vector will be all
6101      zeroes.
6102
6103      PR92772: This algorithm is broken for architectures that support
6104      masked vectors, but do not provide fold_extract_last.  */
6105   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6106     {
6107       auto_vec<std::pair<tree, bool>, 2> ccompares;
6108       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6109       cond_info = vect_stmt_to_vectorize (cond_info);
6110       while (cond_info != reduc_info)
6111         {
6112           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6113             {
6114               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6115               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6116               ccompares.safe_push
6117                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6118                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
6119             }
6120           cond_info
6121             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6122                                                  1 + STMT_VINFO_REDUC_IDX
6123                                                         (cond_info)));
6124           cond_info = vect_stmt_to_vectorize (cond_info);
6125         }
6126       gcc_assert (ccompares.length () != 0);
6127
6128       tree indx_before_incr, indx_after_incr;
6129       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6130       int scalar_precision
6131         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6132       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6133       tree cr_index_vector_type = get_related_vectype_for_scalar_type
6134         (TYPE_MODE (vectype), cr_index_scalar_type,
6135          TYPE_VECTOR_SUBPARTS (vectype));
6136
6137       /* First we create a simple vector induction variable which starts
6138          with the values {1,2,3,...} (SERIES_VECT) and increments by the
6139          vector size (STEP).  */
6140
6141       /* Create a {1,2,3,...} vector.  */
6142       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6143
6144       /* Create a vector of the step value.  */
6145       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6146       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6147
6148       /* Create an induction variable.  */
6149       gimple_stmt_iterator incr_gsi;
6150       bool insert_after;
6151       vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6152       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6153                  insert_after, &indx_before_incr, &indx_after_incr);
6154
6155       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6156          filled with zeros (VEC_ZERO).  */
6157
6158       /* Create a vector of 0s.  */
6159       tree zero = build_zero_cst (cr_index_scalar_type);
6160       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6161
6162       /* Create a vector phi node.  */
6163       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6164       new_phi = create_phi_node (new_phi_tree, loop->header);
6165       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6166                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
6167
6168       /* Now take the condition from the loops original cond_exprs
6169          and produce a new cond_exprs (INDEX_COND_EXPR) which for
6170          every match uses values from the induction variable
6171          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6172          (NEW_PHI_TREE).
6173          Finally, we update the phi (NEW_PHI_TREE) to take the value of
6174          the new cond_expr (INDEX_COND_EXPR).  */
6175       gimple_seq stmts = NULL;
6176       for (int i = ccompares.length () - 1; i != -1; --i)
6177         {
6178           tree ccompare = ccompares[i].first;
6179           if (ccompares[i].second)
6180             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6181                                          cr_index_vector_type,
6182                                          ccompare,
6183                                          indx_before_incr, new_phi_tree);
6184           else
6185             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6186                                          cr_index_vector_type,
6187                                          ccompare,
6188                                          new_phi_tree, indx_before_incr);
6189         }
6190       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6191
6192       /* Update the phi with the vec cond.  */
6193       induction_index = new_phi_tree;
6194       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6195                    loop_latch_edge (loop), UNKNOWN_LOCATION);
6196     }
6197
6198   /* 2. Create epilog code.
6199         The reduction epilog code operates across the elements of the vector
6200         of partial results computed by the vectorized loop.
6201         The reduction epilog code consists of:
6202
6203         step 1: compute the scalar result in a vector (v_out2)
6204         step 2: extract the scalar result (s_out3) from the vector (v_out2)
6205         step 3: adjust the scalar result (s_out3) if needed.
6206
6207         Step 1 can be accomplished using one the following three schemes:
6208           (scheme 1) using reduc_fn, if available.
6209           (scheme 2) using whole-vector shifts, if available.
6210           (scheme 3) using a scalar loop. In this case steps 1+2 above are
6211                      combined.
6212
6213           The overall epilog code looks like this:
6214
6215           s_out0 = phi <s_loop>         # original EXIT_PHI
6216           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
6217           v_out2 = reduce <v_out1>              # step 1
6218           s_out3 = extract_field <v_out2, 0>    # step 2
6219           s_out4 = adjust_result <s_out3>       # step 3
6220
6221           (step 3 is optional, and steps 1 and 2 may be combined).
6222           Lastly, the uses of s_out0 are replaced by s_out4.  */
6223
6224
6225   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6226          v_out1 = phi <VECT_DEF>
6227          Store them in NEW_PHIS.  */
6228   if (double_reduc)
6229     loop = outer_loop;
6230   /* We need to reduce values in all exits.  */
6231   exit_bb = loop_exit->dest;
6232   exit_gsi = gsi_after_labels (exit_bb);
6233   reduc_inputs.create (slp_node ? vec_num : ncopies);
6234   vec <gimple *> vec_stmts = vNULL;
6235   for (unsigned i = 0; i < vec_num; i++)
6236     {
6237       gimple_seq stmts = NULL;
6238       def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6239                                main_exit_p, i, vec_stmts);
6240       for (j = 0; j < ncopies; j++)
6241         {
6242           tree new_def = copy_ssa_name (def);
6243           phi = create_phi_node (new_def, exit_bb);
6244           if (j)
6245             def = gimple_get_lhs (vec_stmts[j]);
6246           SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6247           new_def = gimple_convert (&stmts, vectype, new_def);
6248           reduc_inputs.quick_push (new_def);
6249         }
6250       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6251     }
6252
6253   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6254          (i.e. when reduc_fn is not available) and in the final adjustment
6255          code (if needed).  Also get the original scalar reduction variable as
6256          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6257          represents a reduction pattern), the tree-code and scalar-def are
6258          taken from the original stmt that the pattern-stmt (STMT) replaces.
6259          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6260          are taken from STMT.  */
6261
6262   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6263   if (orig_stmt_info != stmt_info)
6264     {
6265       /* Reduction pattern  */
6266       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6267       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6268     }
6269
6270   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6271   scalar_type = TREE_TYPE (scalar_dest);
6272   scalar_results.truncate (0);
6273   scalar_results.reserve_exact (group_size);
6274   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6275   bitsize = TYPE_SIZE (scalar_type);
6276
6277   /* True if we should implement SLP_REDUC using native reduction operations
6278      instead of scalar operations.  */
6279   direct_slp_reduc = (reduc_fn != IFN_LAST
6280                       && slp_reduc
6281                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6282
6283   /* In case of reduction chain, e.g.,
6284      # a1 = phi <a3, a0>
6285      a2 = operation (a1)
6286      a3 = operation (a2),
6287
6288      we may end up with more than one vector result.  Here we reduce them
6289      to one vector.
6290
6291      The same is true for a SLP reduction, e.g.,
6292      # a1 = phi <a2, a0>
6293      # b1 = phi <b2, b0>
6294      a2 = operation (a1)
6295      b2 = operation (a2),
6296
6297      where we can end up with more than one vector as well.  We can
6298      easily accumulate vectors when the number of vector elements is
6299      a multiple of the SLP group size.
6300
6301      The same is true if we couldn't use a single defuse cycle.  */
6302   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6303       || direct_slp_reduc
6304       || (slp_reduc
6305           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6306       || ncopies > 1)
6307     {
6308       gimple_seq stmts = NULL;
6309       tree single_input = reduc_inputs[0];
6310       for (k = 1; k < reduc_inputs.length (); k++)
6311         single_input = gimple_build (&stmts, code, vectype,
6312                                      single_input, reduc_inputs[k]);
6313       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6314
6315       reduc_inputs.truncate (0);
6316       reduc_inputs.safe_push (single_input);
6317     }
6318
6319   tree orig_reduc_input = reduc_inputs[0];
6320
6321   /* If this loop is an epilogue loop that can be skipped after the
6322      main loop, we can only share a reduction operation between the
6323      main loop and the epilogue if we put it at the target of the
6324      skip edge.
6325
6326      We can still reuse accumulators if this check fails.  Doing so has
6327      the minor(?) benefit of making the epilogue loop's scalar result
6328      independent of the main loop's scalar result.  */
6329   bool unify_with_main_loop_p = false;
6330   if (reduc_info->reused_accumulator
6331       && loop_vinfo->skip_this_loop_edge
6332       && single_succ_p (exit_bb)
6333       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6334     {
6335       unify_with_main_loop_p = true;
6336
6337       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6338       reduc_inputs[0] = make_ssa_name (vectype);
6339       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6340       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6341                    UNKNOWN_LOCATION);
6342       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6343                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6344       exit_gsi = gsi_after_labels (reduc_block);
6345     }
6346
6347   /* Shouldn't be used beyond this point.  */
6348   exit_bb = nullptr;
6349
6350   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6351       && reduc_fn != IFN_LAST)
6352     {
6353       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6354          various data values where the condition matched and another vector
6355          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6356          need to extract the last matching index (which will be the index with
6357          highest value) and use this to index into the data vector.
6358          For the case where there were no matches, the data vector will contain
6359          all default values and the index vector will be all zeros.  */
6360
6361       /* Get various versions of the type of the vector of indexes.  */
6362       tree index_vec_type = TREE_TYPE (induction_index);
6363       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6364       tree index_scalar_type = TREE_TYPE (index_vec_type);
6365       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6366
6367       /* Get an unsigned integer version of the type of the data vector.  */
6368       int scalar_precision
6369         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6370       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6371       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6372                                                 vectype);
6373
6374       /* First we need to create a vector (ZERO_VEC) of zeros and another
6375          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6376          can create using a MAX reduction and then expanding.
6377          In the case where the loop never made any matches, the max index will
6378          be zero.  */
6379
6380       /* Vector of {0, 0, 0,...}.  */
6381       tree zero_vec = build_zero_cst (vectype);
6382
6383       /* Find maximum value from the vector of found indexes.  */
6384       tree max_index = make_ssa_name (index_scalar_type);
6385       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6386                                                           1, induction_index);
6387       gimple_call_set_lhs (max_index_stmt, max_index);
6388       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6389
6390       /* Vector of {max_index, max_index, max_index,...}.  */
6391       tree max_index_vec = make_ssa_name (index_vec_type);
6392       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6393                                                       max_index);
6394       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6395                                                         max_index_vec_rhs);
6396       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6397
6398       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6399          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6400          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6401          otherwise.  Only one value should match, resulting in a vector
6402          (VEC_COND) with one data value and the rest zeros.
6403          In the case where the loop never made any matches, every index will
6404          match, resulting in a vector with all data values (which will all be
6405          the default value).  */
6406
6407       /* Compare the max index vector to the vector of found indexes to find
6408          the position of the max value.  */
6409       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6410       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6411                                                       induction_index,
6412                                                       max_index_vec);
6413       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6414
6415       /* Use the compare to choose either values from the data vector or
6416          zero.  */
6417       tree vec_cond = make_ssa_name (vectype);
6418       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6419                                                    vec_compare,
6420                                                    reduc_inputs[0],
6421                                                    zero_vec);
6422       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6423
6424       /* Finally we need to extract the data value from the vector (VEC_COND)
6425          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
6426          reduction, but because this doesn't exist, we can use a MAX reduction
6427          instead.  The data value might be signed or a float so we need to cast
6428          it first.
6429          In the case where the loop never made any matches, the data values are
6430          all identical, and so will reduce down correctly.  */
6431
6432       /* Make the matched data values unsigned.  */
6433       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6434       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6435                                        vec_cond);
6436       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6437                                                         VIEW_CONVERT_EXPR,
6438                                                         vec_cond_cast_rhs);
6439       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6440
6441       /* Reduce down to a scalar value.  */
6442       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6443       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6444                                                            1, vec_cond_cast);
6445       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6446       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6447
6448       /* Convert the reduced value back to the result type and set as the
6449          result.  */
6450       gimple_seq stmts = NULL;
6451       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6452                                data_reduc);
6453       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6454       scalar_results.safe_push (new_temp);
6455     }
6456   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6457            && reduc_fn == IFN_LAST)
6458     {
6459       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6460          idx = 0;
6461          idx_val = induction_index[0];
6462          val = data_reduc[0];
6463          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6464            if (induction_index[i] > idx_val)
6465              val = data_reduc[i], idx_val = induction_index[i];
6466          return val;  */
6467
6468       tree data_eltype = TREE_TYPE (vectype);
6469       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6470       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6471       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6472       /* Enforced by vectorizable_reduction, which ensures we have target
6473          support before allowing a conditional reduction on variable-length
6474          vectors.  */
6475       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6476       tree idx_val = NULL_TREE, val = NULL_TREE;
6477       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6478         {
6479           tree old_idx_val = idx_val;
6480           tree old_val = val;
6481           idx_val = make_ssa_name (idx_eltype);
6482           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6483                                              build3 (BIT_FIELD_REF, idx_eltype,
6484                                                      induction_index,
6485                                                      bitsize_int (el_size),
6486                                                      bitsize_int (off)));
6487           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6488           val = make_ssa_name (data_eltype);
6489           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6490                                              build3 (BIT_FIELD_REF,
6491                                                      data_eltype,
6492                                                      reduc_inputs[0],
6493                                                      bitsize_int (el_size),
6494                                                      bitsize_int (off)));
6495           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6496           if (off != 0)
6497             {
6498               tree new_idx_val = idx_val;
6499               if (off != v_size - el_size)
6500                 {
6501                   new_idx_val = make_ssa_name (idx_eltype);
6502                   epilog_stmt = gimple_build_assign (new_idx_val,
6503                                                      MAX_EXPR, idx_val,
6504                                                      old_idx_val);
6505                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6506                 }
6507               tree cond = make_ssa_name (boolean_type_node);
6508               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6509                                                  idx_val, old_idx_val);
6510               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6511               tree new_val = make_ssa_name (data_eltype);
6512               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6513                                                  cond, val, old_val);
6514               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6515               idx_val = new_idx_val;
6516               val = new_val;
6517             }
6518         }
6519       /* Convert the reduced value back to the result type and set as the
6520          result.  */
6521       gimple_seq stmts = NULL;
6522       val = gimple_convert (&stmts, scalar_type, val);
6523       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6524       scalar_results.safe_push (val);
6525     }
6526
6527   /* 2.3 Create the reduction code, using one of the three schemes described
6528          above. In SLP we simply need to extract all the elements from the
6529          vector (without reducing them), so we use scalar shifts.  */
6530   else if (reduc_fn != IFN_LAST && !slp_reduc)
6531     {
6532       tree tmp;
6533       tree vec_elem_type;
6534
6535       /* Case 1:  Create:
6536          v_out2 = reduc_expr <v_out1>  */
6537
6538       if (dump_enabled_p ())
6539         dump_printf_loc (MSG_NOTE, vect_location,
6540                          "Reduce using direct vector reduction.\n");
6541
6542       gimple_seq stmts = NULL;
6543       vec_elem_type = TREE_TYPE (vectype);
6544       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6545                                vec_elem_type, reduc_inputs[0]);
6546       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6547       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6548
6549       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6550           && induc_val)
6551         {
6552           /* Earlier we set the initial value to be a vector if induc_val
6553              values.  Check the result and if it is induc_val then replace
6554              with the original initial value, unless induc_val is
6555              the same as initial_def already.  */
6556           tree zcompare = make_ssa_name (boolean_type_node);
6557           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6558                                              new_temp, induc_val);
6559           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6560           tree initial_def = reduc_info->reduc_initial_values[0];
6561           tmp = make_ssa_name (new_scalar_dest);
6562           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6563                                              initial_def, new_temp);
6564           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6565           new_temp = tmp;
6566         }
6567
6568       scalar_results.safe_push (new_temp);
6569     }
6570   else if (direct_slp_reduc)
6571     {
6572       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6573          with the elements for other SLP statements replaced with the
6574          neutral value.  We can then do a normal reduction on each vector.  */
6575
6576       /* Enforced by vectorizable_reduction.  */
6577       gcc_assert (reduc_inputs.length () == 1);
6578       gcc_assert (pow2p_hwi (group_size));
6579
6580       gimple_seq seq = NULL;
6581
6582       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6583          and the same element size as VECTYPE.  */
6584       tree index = build_index_vector (vectype, 0, 1);
6585       tree index_type = TREE_TYPE (index);
6586       tree index_elt_type = TREE_TYPE (index_type);
6587       tree mask_type = truth_type_for (index_type);
6588
6589       /* Create a vector that, for each element, identifies which of
6590          the REDUC_GROUP_SIZE results should use it.  */
6591       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6592       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6593                             build_vector_from_val (index_type, index_mask));
6594
6595       /* Get a neutral vector value.  This is simply a splat of the neutral
6596          scalar value if we have one, otherwise the initial scalar value
6597          is itself a neutral value.  */
6598       tree vector_identity = NULL_TREE;
6599       tree neutral_op = NULL_TREE;
6600       if (slp_node)
6601         {
6602           tree initial_value = NULL_TREE;
6603           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6604             initial_value = reduc_info->reduc_initial_values[0];
6605           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6606                                                  initial_value, false);
6607         }
6608       if (neutral_op)
6609         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6610                                                         neutral_op);
6611       for (unsigned int i = 0; i < group_size; ++i)
6612         {
6613           /* If there's no univeral neutral value, we can use the
6614              initial scalar value from the original PHI.  This is used
6615              for MIN and MAX reduction, for example.  */
6616           if (!neutral_op)
6617             {
6618               tree scalar_value = reduc_info->reduc_initial_values[i];
6619               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6620                                              scalar_value);
6621               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6622                                                               scalar_value);
6623             }
6624
6625           /* Calculate the equivalent of:
6626
6627              sel[j] = (index[j] == i);
6628
6629              which selects the elements of REDUC_INPUTS[0] that should
6630              be included in the result.  */
6631           tree compare_val = build_int_cst (index_elt_type, i);
6632           compare_val = build_vector_from_val (index_type, compare_val);
6633           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6634                                    index, compare_val);
6635
6636           /* Calculate the equivalent of:
6637
6638              vec = seq ? reduc_inputs[0] : vector_identity;
6639
6640              VEC is now suitable for a full vector reduction.  */
6641           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6642                                    sel, reduc_inputs[0], vector_identity);
6643
6644           /* Do the reduction and convert it to the appropriate type.  */
6645           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6646                                       TREE_TYPE (vectype), vec);
6647           scalar = gimple_convert (&seq, scalar_type, scalar);
6648           scalar_results.safe_push (scalar);
6649         }
6650       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6651     }
6652   else
6653     {
6654       bool reduce_with_shift;
6655       tree vec_temp;
6656
6657       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6658
6659       /* See if the target wants to do the final (shift) reduction
6660          in a vector mode of smaller size and first reduce upper/lower
6661          halves against each other.  */
6662       enum machine_mode mode1 = mode;
6663       tree stype = TREE_TYPE (vectype);
6664       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6665       unsigned nunits1 = nunits;
6666       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6667           && reduc_inputs.length () == 1)
6668         {
6669           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6670           /* For SLP reductions we have to make sure lanes match up, but
6671              since we're doing individual element final reduction reducing
6672              vector width here is even more important.
6673              ???  We can also separate lanes with permutes, for the common
6674              case of power-of-two group-size odd/even extracts would work.  */
6675           if (slp_reduc && nunits != nunits1)
6676             {
6677               nunits1 = least_common_multiple (nunits1, group_size);
6678               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6679             }
6680         }
6681       if (!slp_reduc
6682           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6683         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6684
6685       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6686                                                            stype, nunits1);
6687       reduce_with_shift = have_whole_vector_shift (mode1);
6688       if (!VECTOR_MODE_P (mode1)
6689           || !directly_supported_p (code, vectype1))
6690         reduce_with_shift = false;
6691
6692       /* First reduce the vector to the desired vector size we should
6693          do shift reduction on by combining upper and lower halves.  */
6694       gimple_seq stmts = NULL;
6695       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6696                                              code, &stmts);
6697       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6698       reduc_inputs[0] = new_temp;
6699
6700       if (reduce_with_shift && !slp_reduc)
6701         {
6702           int element_bitsize = tree_to_uhwi (bitsize);
6703           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6704              for variable-length vectors and also requires direct target support
6705              for loop reductions.  */
6706           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6707           int nelements = vec_size_in_bits / element_bitsize;
6708           vec_perm_builder sel;
6709           vec_perm_indices indices;
6710
6711           int elt_offset;
6712
6713           tree zero_vec = build_zero_cst (vectype1);
6714           /* Case 2: Create:
6715              for (offset = nelements/2; offset >= 1; offset/=2)
6716                 {
6717                   Create:  va' = vec_shift <va, offset>
6718                   Create:  va = vop <va, va'>
6719                 }  */
6720
6721           tree rhs;
6722
6723           if (dump_enabled_p ())
6724             dump_printf_loc (MSG_NOTE, vect_location,
6725                              "Reduce using vector shifts\n");
6726
6727           gimple_seq stmts = NULL;
6728           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6729           for (elt_offset = nelements / 2;
6730                elt_offset >= 1;
6731                elt_offset /= 2)
6732             {
6733               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6734               indices.new_vector (sel, 2, nelements);
6735               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6736               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6737                                        new_temp, zero_vec, mask);
6738               new_temp = gimple_build (&stmts, code,
6739                                        vectype1, new_name, new_temp);
6740             }
6741           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6742
6743           /* 2.4  Extract the final scalar result.  Create:
6744              s_out3 = extract_field <v_out2, bitpos>  */
6745
6746           if (dump_enabled_p ())
6747             dump_printf_loc (MSG_NOTE, vect_location,
6748                              "extract scalar result\n");
6749
6750           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6751                         bitsize, bitsize_zero_node);
6752           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6753           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6754           gimple_assign_set_lhs (epilog_stmt, new_temp);
6755           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6756           scalar_results.safe_push (new_temp);
6757         }
6758       else
6759         {
6760           /* Case 3: Create:
6761              s = extract_field <v_out2, 0>
6762              for (offset = element_size;
6763                   offset < vector_size;
6764                   offset += element_size;)
6765                {
6766                  Create:  s' = extract_field <v_out2, offset>
6767                  Create:  s = op <s, s'>  // For non SLP cases
6768                }  */
6769
6770           if (dump_enabled_p ())
6771             dump_printf_loc (MSG_NOTE, vect_location,
6772                              "Reduce using scalar code.\n");
6773
6774           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6775           int element_bitsize = tree_to_uhwi (bitsize);
6776           tree compute_type = TREE_TYPE (vectype);
6777           gimple_seq stmts = NULL;
6778           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6779             {
6780               int bit_offset;
6781               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6782                                        vec_temp, bitsize, bitsize_zero_node);
6783
6784               /* In SLP we don't need to apply reduction operation, so we just
6785                  collect s' values in SCALAR_RESULTS.  */
6786               if (slp_reduc)
6787                 scalar_results.safe_push (new_temp);
6788
6789               for (bit_offset = element_bitsize;
6790                    bit_offset < vec_size_in_bits;
6791                    bit_offset += element_bitsize)
6792                 {
6793                   tree bitpos = bitsize_int (bit_offset);
6794                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6795                                            compute_type, vec_temp,
6796                                            bitsize, bitpos);
6797                   if (slp_reduc)
6798                     {
6799                       /* In SLP we don't need to apply reduction operation, so
6800                          we just collect s' values in SCALAR_RESULTS.  */
6801                       new_temp = new_name;
6802                       scalar_results.safe_push (new_name);
6803                     }
6804                   else
6805                     new_temp = gimple_build (&stmts, code, compute_type,
6806                                              new_name, new_temp);
6807                 }
6808             }
6809
6810           /* The only case where we need to reduce scalar results in SLP, is
6811              unrolling.  If the size of SCALAR_RESULTS is greater than
6812              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6813              REDUC_GROUP_SIZE.  */
6814           if (slp_reduc)
6815             {
6816               tree res, first_res, new_res;
6817
6818               /* Reduce multiple scalar results in case of SLP unrolling.  */
6819               for (j = group_size; scalar_results.iterate (j, &res);
6820                    j++)
6821                 {
6822                   first_res = scalar_results[j % group_size];
6823                   new_res = gimple_build (&stmts, code, compute_type,
6824                                           first_res, res);
6825                   scalar_results[j % group_size] = new_res;
6826                 }
6827               scalar_results.truncate (group_size);
6828               for (k = 0; k < group_size; k++)
6829                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6830                                                     scalar_results[k]);
6831             }
6832           else
6833             {
6834               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6835               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6836               scalar_results.safe_push (new_temp);
6837             }
6838
6839           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6840         }
6841
6842       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6843           && induc_val)
6844         {
6845           /* Earlier we set the initial value to be a vector if induc_val
6846              values.  Check the result and if it is induc_val then replace
6847              with the original initial value, unless induc_val is
6848              the same as initial_def already.  */
6849           tree zcompare = make_ssa_name (boolean_type_node);
6850           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6851                                              induc_val);
6852           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6853           tree initial_def = reduc_info->reduc_initial_values[0];
6854           tree tmp = make_ssa_name (new_scalar_dest);
6855           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6856                                              initial_def, new_temp);
6857           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6858           scalar_results[0] = tmp;
6859         }
6860     }
6861
6862   /* 2.5 Adjust the final result by the initial value of the reduction
6863          variable. (When such adjustment is not needed, then
6864          'adjustment_def' is zero).  For example, if code is PLUS we create:
6865          new_temp = loop_exit_def + adjustment_def  */
6866
6867   if (adjustment_def)
6868     {
6869       gcc_assert (!slp_reduc);
6870       gimple_seq stmts = NULL;
6871       if (double_reduc)
6872         {
6873           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6874           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6875           new_temp = gimple_build (&stmts, code, vectype,
6876                                    reduc_inputs[0], adjustment_def);
6877         }
6878       else
6879         {
6880           new_temp = scalar_results[0];
6881           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6882           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6883                                            adjustment_def);
6884           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6885           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6886                                    new_temp, adjustment_def);
6887           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6888         }
6889
6890       epilog_stmt = gimple_seq_last_stmt (stmts);
6891       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6892       scalar_results[0] = new_temp;
6893     }
6894
6895   /* Record this operation if it could be reused by the epilogue loop.  */
6896   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6897       && reduc_inputs.length () == 1)
6898     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6899                                            { orig_reduc_input, reduc_info });
6900
6901   if (double_reduc)
6902     loop = outer_loop;
6903
6904   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6905           phis with new adjusted scalar results, i.e., replace use <s_out0>
6906           with use <s_out4>.
6907
6908      Transform:
6909         loop_exit:
6910           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6911           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6912           v_out2 = reduce <v_out1>
6913           s_out3 = extract_field <v_out2, 0>
6914           s_out4 = adjust_result <s_out3>
6915           use <s_out0>
6916           use <s_out0>
6917
6918      into:
6919
6920         loop_exit:
6921           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6922           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6923           v_out2 = reduce <v_out1>
6924           s_out3 = extract_field <v_out2, 0>
6925           s_out4 = adjust_result <s_out3>
6926           use <s_out4>
6927           use <s_out4> */
6928
6929   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6930   for (k = 0; k < live_out_stmts.size (); k++)
6931     {
6932       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6933       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6934
6935       phis.create (3);
6936       /* Find the loop-closed-use at the loop exit of the original scalar
6937          result.  (The reduction result is expected to have two immediate uses,
6938          one at the latch block, and one at the loop exit).  For double
6939          reductions we are looking for exit phis of the outer loop.  */
6940       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6941         {
6942           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6943             {
6944               if (!is_gimple_debug (USE_STMT (use_p)))
6945                 phis.safe_push (USE_STMT (use_p));
6946             }
6947           else
6948             {
6949               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6950                 {
6951                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6952
6953                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6954                     {
6955                       if (!flow_bb_inside_loop_p (loop,
6956                                              gimple_bb (USE_STMT (phi_use_p)))
6957                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6958                         phis.safe_push (USE_STMT (phi_use_p));
6959                     }
6960                 }
6961             }
6962         }
6963
6964       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6965         {
6966           /* Replace the uses:  */
6967           orig_name = PHI_RESULT (exit_phi);
6968
6969           /* Look for a single use at the target of the skip edge.  */
6970           if (unify_with_main_loop_p)
6971             {
6972               use_operand_p use_p;
6973               gimple *user;
6974               if (!single_imm_use (orig_name, &use_p, &user))
6975                 gcc_unreachable ();
6976               orig_name = gimple_get_lhs (user);
6977             }
6978
6979           scalar_result = scalar_results[k];
6980           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6981             {
6982               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6983                 SET_USE (use_p, scalar_result);
6984               update_stmt (use_stmt);
6985             }
6986         }
6987
6988       phis.release ();
6989     }
6990 }
6991
6992 /* Return a vector of type VECTYPE that is equal to the vector select
6993    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6994    before GSI.  */
6995
6996 static tree
6997 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6998                      tree vec, tree identity)
6999 {
7000   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7001   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7002                                           mask, vec, identity);
7003   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7004   return cond;
7005 }
7006
7007 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7008    order, starting with LHS.  Insert the extraction statements before GSI and
7009    associate the new scalar SSA names with variable SCALAR_DEST.
7010    If MASK is nonzero mask the input and then operate on it unconditionally.
7011    Return the SSA name for the result.  */
7012
7013 static tree
7014 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7015                        tree_code code, tree lhs, tree vector_rhs,
7016                        tree mask)
7017 {
7018   tree vectype = TREE_TYPE (vector_rhs);
7019   tree scalar_type = TREE_TYPE (vectype);
7020   tree bitsize = TYPE_SIZE (scalar_type);
7021   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7022   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7023
7024   /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7025      to perform an unconditional element-wise reduction of it.  */
7026   if (mask)
7027     {
7028       tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7029                                                    "masked_vector_rhs");
7030       tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7031                                                   false);
7032       tree vector_identity = build_vector_from_val (vectype, neutral_op);
7033       gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7034                                              mask, vector_rhs, vector_identity);
7035       gsi_insert_before (gsi, select, GSI_SAME_STMT);
7036       vector_rhs = masked_vector_rhs;
7037     }
7038
7039   for (unsigned HOST_WIDE_INT bit_offset = 0;
7040        bit_offset < vec_size_in_bits;
7041        bit_offset += element_bitsize)
7042     {
7043       tree bitpos = bitsize_int (bit_offset);
7044       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7045                          bitsize, bitpos);
7046
7047       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7048       rhs = make_ssa_name (scalar_dest, stmt);
7049       gimple_assign_set_lhs (stmt, rhs);
7050       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7051
7052       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7053       tree new_name = make_ssa_name (scalar_dest, stmt);
7054       gimple_assign_set_lhs (stmt, new_name);
7055       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7056       lhs = new_name;
7057     }
7058   return lhs;
7059 }
7060
7061 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
7062    type of the vector input.  */
7063
7064 static internal_fn
7065 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7066 {
7067   internal_fn mask_reduc_fn;
7068   internal_fn mask_len_reduc_fn;
7069
7070   switch (reduc_fn)
7071     {
7072     case IFN_FOLD_LEFT_PLUS:
7073       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7074       mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7075       break;
7076
7077     default:
7078       return IFN_LAST;
7079     }
7080
7081   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7082                                       OPTIMIZE_FOR_SPEED))
7083     return mask_reduc_fn;
7084   if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7085                                       OPTIMIZE_FOR_SPEED))
7086     return mask_len_reduc_fn;
7087   return IFN_LAST;
7088 }
7089
7090 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
7091    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
7092    statement.  CODE is the operation performed by STMT_INFO and OPS are
7093    its scalar operands.  REDUC_INDEX is the index of the operand in
7094    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
7095    implements in-order reduction, or IFN_LAST if we should open-code it.
7096    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
7097    that should be used to control the operation in a fully-masked loop.  */
7098
7099 static bool
7100 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7101                                stmt_vec_info stmt_info,
7102                                gimple_stmt_iterator *gsi,
7103                                gimple **vec_stmt, slp_tree slp_node,
7104                                gimple *reduc_def_stmt,
7105                                code_helper code, internal_fn reduc_fn,
7106                                tree *ops, int num_ops, tree vectype_in,
7107                                int reduc_index, vec_loop_masks *masks,
7108                                vec_loop_lens *lens)
7109 {
7110   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7111   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7112   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7113
7114   int ncopies;
7115   if (slp_node)
7116     ncopies = 1;
7117   else
7118     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7119
7120   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7121   gcc_assert (ncopies == 1);
7122
7123   bool is_cond_op = false;
7124   if (!code.is_tree_code ())
7125     {
7126       code = conditional_internal_fn_code (internal_fn (code));
7127       gcc_assert (code != ERROR_MARK);
7128       is_cond_op = true;
7129     }
7130
7131   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7132
7133   if (slp_node)
7134     {
7135       if (is_cond_op)
7136         {
7137           if (dump_enabled_p ())
7138             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7139                              "fold-left reduction on SLP not supported.\n");
7140           return false;
7141         }
7142
7143       gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7144                             TYPE_VECTOR_SUBPARTS (vectype_in)));
7145     }
7146
7147   /* The operands either come from a binary operation or an IFN_COND operation.
7148      The former is a gimple assign with binary rhs and the latter is a
7149      gimple call with four arguments.  */
7150   gcc_assert (num_ops == 2 || num_ops == 4);
7151   tree op0, opmask;
7152   if (!is_cond_op)
7153     op0 = ops[1 - reduc_index];
7154   else
7155     {
7156       op0 = ops[2 + (1 - reduc_index)];
7157       opmask = ops[0];
7158       gcc_assert (!slp_node);
7159     }
7160
7161   int group_size = 1;
7162   stmt_vec_info scalar_dest_def_info;
7163   auto_vec<tree> vec_oprnds0, vec_opmask;
7164   if (slp_node)
7165     {
7166       auto_vec<vec<tree> > vec_defs (2);
7167       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7168       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7169       vec_defs[0].release ();
7170       vec_defs[1].release ();
7171       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7172       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7173     }
7174   else
7175     {
7176       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7177                                      op0, &vec_oprnds0);
7178       scalar_dest_def_info = stmt_info;
7179
7180       /* For an IFN_COND_OP we also need the vector mask operand.  */
7181       if (is_cond_op)
7182           vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7183                                          opmask, &vec_opmask);
7184     }
7185
7186   gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7187   tree scalar_dest = gimple_get_lhs (sdef);
7188   tree scalar_type = TREE_TYPE (scalar_dest);
7189   tree reduc_var = gimple_phi_result (reduc_def_stmt);
7190
7191   int vec_num = vec_oprnds0.length ();
7192   gcc_assert (vec_num == 1 || slp_node);
7193   tree vec_elem_type = TREE_TYPE (vectype_out);
7194   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7195
7196   tree vector_identity = NULL_TREE;
7197   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7198     {
7199       vector_identity = build_zero_cst (vectype_out);
7200       if (!HONOR_SIGNED_ZEROS (vectype_out))
7201         ;
7202       else
7203         {
7204           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7205           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7206                                         vector_identity);
7207         }
7208     }
7209
7210   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7211   int i;
7212   tree def0;
7213   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7214     {
7215       gimple *new_stmt;
7216       tree mask = NULL_TREE;
7217       tree len = NULL_TREE;
7218       tree bias = NULL_TREE;
7219       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7220         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7221       else if (is_cond_op)
7222         mask = vec_opmask[0];
7223       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7224         {
7225           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7226                                    i, 1);
7227           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7228           bias = build_int_cst (intQI_type_node, biasval);
7229           if (!is_cond_op)
7230             mask = build_minus_one_cst (truth_type_for (vectype_in));
7231         }
7232
7233       /* Handle MINUS by adding the negative.  */
7234       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7235         {
7236           tree negated = make_ssa_name (vectype_out);
7237           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7238           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7239           def0 = negated;
7240         }
7241
7242       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7243           && mask && mask_reduc_fn == IFN_LAST)
7244         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7245                                     vector_identity);
7246
7247       /* On the first iteration the input is simply the scalar phi
7248          result, and for subsequent iterations it is the output of
7249          the preceding operation.  */
7250       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7251         {
7252           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7253             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7254                                                    def0, mask, len, bias);
7255           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7256             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7257                                                    def0, mask);
7258           else
7259             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7260                                                    def0);
7261           /* For chained SLP reductions the output of the previous reduction
7262              operation serves as the input of the next. For the final statement
7263              the output cannot be a temporary - we reuse the original
7264              scalar destination of the last statement.  */
7265           if (i != vec_num - 1)
7266             {
7267               gimple_set_lhs (new_stmt, scalar_dest_var);
7268               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7269               gimple_set_lhs (new_stmt, reduc_var);
7270             }
7271         }
7272       else
7273         {
7274           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7275                                              tree_code (code), reduc_var, def0,
7276                                              mask);
7277           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7278           /* Remove the statement, so that we can use the same code paths
7279              as for statements that we've just created.  */
7280           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7281           gsi_remove (&tmp_gsi, true);
7282         }
7283
7284       if (i == vec_num - 1)
7285         {
7286           gimple_set_lhs (new_stmt, scalar_dest);
7287           vect_finish_replace_stmt (loop_vinfo,
7288                                     scalar_dest_def_info,
7289                                     new_stmt);
7290         }
7291       else
7292         vect_finish_stmt_generation (loop_vinfo,
7293                                      scalar_dest_def_info,
7294                                      new_stmt, gsi);
7295
7296       if (slp_node)
7297         slp_node->push_vec_def (new_stmt);
7298       else
7299         {
7300           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7301           *vec_stmt = new_stmt;
7302         }
7303     }
7304
7305   return true;
7306 }
7307
7308 /* Function is_nonwrapping_integer_induction.
7309
7310    Check if STMT_VINO (which is part of loop LOOP) both increments and
7311    does not cause overflow.  */
7312
7313 static bool
7314 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7315 {
7316   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7317   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7318   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7319   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7320   widest_int ni, max_loop_value, lhs_max;
7321   wi::overflow_type overflow = wi::OVF_NONE;
7322
7323   /* Make sure the loop is integer based.  */
7324   if (TREE_CODE (base) != INTEGER_CST
7325       || TREE_CODE (step) != INTEGER_CST)
7326     return false;
7327
7328   /* Check that the max size of the loop will not wrap.  */
7329
7330   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7331     return true;
7332
7333   if (! max_stmt_executions (loop, &ni))
7334     return false;
7335
7336   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7337                             &overflow);
7338   if (overflow)
7339     return false;
7340
7341   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7342                             TYPE_SIGN (lhs_type), &overflow);
7343   if (overflow)
7344     return false;
7345
7346   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7347           <= TYPE_PRECISION (lhs_type));
7348 }
7349
7350 /* Check if masking can be supported by inserting a conditional expression.
7351    CODE is the code for the operation.  COND_FN is the conditional internal
7352    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
7353 static bool
7354 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7355                          tree vectype_in)
7356 {
7357   if (cond_fn != IFN_LAST
7358       && direct_internal_fn_supported_p (cond_fn, vectype_in,
7359                                          OPTIMIZE_FOR_SPEED))
7360     return false;
7361
7362   if (code.is_tree_code ())
7363     switch (tree_code (code))
7364       {
7365       case DOT_PROD_EXPR:
7366       case SAD_EXPR:
7367         return true;
7368
7369       default:
7370         break;
7371       }
7372   return false;
7373 }
7374
7375 /* Insert a conditional expression to enable masked vectorization.  CODE is the
7376    code for the operation.  VOP is the array of operands.  MASK is the loop
7377    mask.  GSI is a statement iterator used to place the new conditional
7378    expression.  */
7379 static void
7380 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7381                       gimple_stmt_iterator *gsi)
7382 {
7383   switch (tree_code (code))
7384     {
7385     case DOT_PROD_EXPR:
7386       {
7387         tree vectype = TREE_TYPE (vop[1]);
7388         tree zero = build_zero_cst (vectype);
7389         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7390         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7391                                                mask, vop[1], zero);
7392         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7393         vop[1] = masked_op1;
7394         break;
7395       }
7396
7397     case SAD_EXPR:
7398       {
7399         tree vectype = TREE_TYPE (vop[1]);
7400         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7401         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7402                                                mask, vop[1], vop[0]);
7403         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7404         vop[1] = masked_op1;
7405         break;
7406       }
7407
7408     default:
7409       gcc_unreachable ();
7410     }
7411 }
7412
7413 /* Function vectorizable_reduction.
7414
7415    Check if STMT_INFO performs a reduction operation that can be vectorized.
7416    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7417    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7418    Return true if STMT_INFO is vectorizable in this way.
7419
7420    This function also handles reduction idioms (patterns) that have been
7421    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7422    may be of this form:
7423      X = pattern_expr (arg0, arg1, ..., X)
7424    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7425    sequence that had been detected and replaced by the pattern-stmt
7426    (STMT_INFO).
7427
7428    This function also handles reduction of condition expressions, for example:
7429      for (int i = 0; i < N; i++)
7430        if (a[i] < value)
7431          last = a[i];
7432    This is handled by vectorising the loop and creating an additional vector
7433    containing the loop indexes for which "a[i] < value" was true.  In the
7434    function epilogue this is reduced to a single max value and then used to
7435    index into the vector of results.
7436
7437    In some cases of reduction patterns, the type of the reduction variable X is
7438    different than the type of the other arguments of STMT_INFO.
7439    In such cases, the vectype that is used when transforming STMT_INFO into
7440    a vector stmt is different than the vectype that is used to determine the
7441    vectorization factor, because it consists of a different number of elements
7442    than the actual number of elements that are being operated upon in parallel.
7443
7444    For example, consider an accumulation of shorts into an int accumulator.
7445    On some targets it's possible to vectorize this pattern operating on 8
7446    shorts at a time (hence, the vectype for purposes of determining the
7447    vectorization factor should be V8HI); on the other hand, the vectype that
7448    is used to create the vector form is actually V4SI (the type of the result).
7449
7450    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7451    indicates what is the actual level of parallelism (V8HI in the example), so
7452    that the right vectorization factor would be derived.  This vectype
7453    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7454    be used to create the vectorized stmt.  The right vectype for the vectorized
7455    stmt is obtained from the type of the result X:
7456       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7457
7458    This means that, contrary to "regular" reductions (or "regular" stmts in
7459    general), the following equation:
7460       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7461    does *NOT* necessarily hold for reduction patterns.  */
7462
7463 bool
7464 vectorizable_reduction (loop_vec_info loop_vinfo,
7465                         stmt_vec_info stmt_info, slp_tree slp_node,
7466                         slp_instance slp_node_instance,
7467                         stmt_vector_for_cost *cost_vec)
7468 {
7469   tree vectype_in = NULL_TREE;
7470   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7471   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7472   stmt_vec_info cond_stmt_vinfo = NULL;
7473   int i;
7474   int ncopies;
7475   bool single_defuse_cycle = false;
7476   bool nested_cycle = false;
7477   bool double_reduc = false;
7478   int vec_num;
7479   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7480   tree cond_reduc_val = NULL_TREE;
7481
7482   /* Make sure it was already recognized as a reduction computation.  */
7483   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7484       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7485       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7486     return false;
7487
7488   /* The stmt we store reduction analysis meta on.  */
7489   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7490   reduc_info->is_reduc_info = true;
7491
7492   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7493     {
7494       if (is_a <gphi *> (stmt_info->stmt))
7495         {
7496           if (slp_node)
7497             {
7498               /* We eventually need to set a vector type on invariant
7499                  arguments.  */
7500               unsigned j;
7501               slp_tree child;
7502               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7503                 if (!vect_maybe_update_slp_op_vectype
7504                        (child, SLP_TREE_VECTYPE (slp_node)))
7505                   {
7506                     if (dump_enabled_p ())
7507                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7508                                        "incompatible vector types for "
7509                                        "invariants\n");
7510                     return false;
7511                   }
7512             }
7513           /* Analysis for double-reduction is done on the outer
7514              loop PHI, nested cycles have no further restrictions.  */
7515           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7516         }
7517       else
7518         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7519       return true;
7520     }
7521
7522   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7523   stmt_vec_info phi_info = stmt_info;
7524   if (!is_a <gphi *> (stmt_info->stmt))
7525     {
7526       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7527       return true;
7528     }
7529   if (slp_node)
7530     {
7531       slp_node_instance->reduc_phis = slp_node;
7532       /* ???  We're leaving slp_node to point to the PHIs, we only
7533          need it to get at the number of vector stmts which wasn't
7534          yet initialized for the instance root.  */
7535     }
7536   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7537     {
7538       use_operand_p use_p;
7539       gimple *use_stmt;
7540       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7541                                  &use_p, &use_stmt);
7542       gcc_assert (res);
7543       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7544     }
7545
7546   /* PHIs should not participate in patterns.  */
7547   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7548   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7549
7550   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7551      and compute the reduction chain length.  Discover the real
7552      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7553   tree reduc_def
7554     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7555                              loop_latch_edge
7556                                (gimple_bb (reduc_def_phi)->loop_father));
7557   unsigned reduc_chain_length = 0;
7558   bool only_slp_reduc_chain = true;
7559   stmt_info = NULL;
7560   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7561   while (reduc_def != PHI_RESULT (reduc_def_phi))
7562     {
7563       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7564       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7565       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7566         {
7567           if (dump_enabled_p ())
7568             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7569                              "reduction chain broken by patterns.\n");
7570           return false;
7571         }
7572       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7573         only_slp_reduc_chain = false;
7574       /* For epilogue generation live members of the chain need
7575          to point back to the PHI via their original stmt for
7576          info_for_reduction to work.  For SLP we need to look at
7577          all lanes here - even though we only will vectorize from
7578          the SLP node with live lane zero the other live lanes also
7579          need to be identified as part of a reduction to be able
7580          to skip code generation for them.  */
7581       if (slp_for_stmt_info)
7582         {
7583           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7584             if (STMT_VINFO_LIVE_P (s))
7585               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7586         }
7587       else if (STMT_VINFO_LIVE_P (vdef))
7588         STMT_VINFO_REDUC_DEF (def) = phi_info;
7589       gimple_match_op op;
7590       if (!gimple_extract_op (vdef->stmt, &op))
7591         {
7592           if (dump_enabled_p ())
7593             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7594                              "reduction chain includes unsupported"
7595                              " statement type.\n");
7596           return false;
7597         }
7598       if (CONVERT_EXPR_CODE_P (op.code))
7599         {
7600           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7601             {
7602               if (dump_enabled_p ())
7603                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604                                  "conversion in the reduction chain.\n");
7605               return false;
7606             }
7607         }
7608       else if (!stmt_info)
7609         /* First non-conversion stmt.  */
7610         stmt_info = vdef;
7611       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7612       reduc_chain_length++;
7613       if (!stmt_info && slp_node)
7614         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7615     }
7616   /* PHIs should not participate in patterns.  */
7617   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7618
7619   if (nested_in_vect_loop_p (loop, stmt_info))
7620     {
7621       loop = loop->inner;
7622       nested_cycle = true;
7623     }
7624
7625   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7626      element.  */
7627   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7628     {
7629       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7630       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7631     }
7632   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7633     gcc_assert (slp_node
7634                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7635
7636   /* 1. Is vectorizable reduction?  */
7637   /* Not supportable if the reduction variable is used in the loop, unless
7638      it's a reduction chain.  */
7639   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7640       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7641     return false;
7642
7643   /* Reductions that are not used even in an enclosing outer-loop,
7644      are expected to be "live" (used out of the loop).  */
7645   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7646       && !STMT_VINFO_LIVE_P (stmt_info))
7647     return false;
7648
7649   /* 2. Has this been recognized as a reduction pattern?
7650
7651      Check if STMT represents a pattern that has been recognized
7652      in earlier analysis stages.  For stmts that represent a pattern,
7653      the STMT_VINFO_RELATED_STMT field records the last stmt in
7654      the original sequence that constitutes the pattern.  */
7655
7656   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7657   if (orig_stmt_info)
7658     {
7659       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7660       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7661     }
7662
7663   /* 3. Check the operands of the operation.  The first operands are defined
7664         inside the loop body. The last operand is the reduction variable,
7665         which is defined by the loop-header-phi.  */
7666
7667   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7668   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7669   gimple_match_op op;
7670   if (!gimple_extract_op (stmt_info->stmt, &op))
7671     gcc_unreachable ();
7672   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7673                             || op.code == WIDEN_SUM_EXPR
7674                             || op.code == SAD_EXPR);
7675
7676   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7677       && !SCALAR_FLOAT_TYPE_P (op.type))
7678     return false;
7679
7680   /* Do not try to vectorize bit-precision reductions.  */
7681   if (!type_has_mode_precision_p (op.type))
7682     return false;
7683
7684   /* For lane-reducing ops we're reducing the number of reduction PHIs
7685      which means the only use of that may be in the lane-reducing operation.  */
7686   if (lane_reduc_code_p
7687       && reduc_chain_length != 1
7688       && !only_slp_reduc_chain)
7689     {
7690       if (dump_enabled_p ())
7691         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7692                          "lane-reducing reduction with extra stmts.\n");
7693       return false;
7694     }
7695
7696   /* All uses but the last are expected to be defined in the loop.
7697      The last use is the reduction variable.  In case of nested cycle this
7698      assumption is not true: we use reduc_index to record the index of the
7699      reduction variable.  */
7700   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7701   tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7702   /* We need to skip an extra operand for COND_EXPRs with embedded
7703      comparison.  */
7704   unsigned opno_adjust = 0;
7705   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7706     opno_adjust = 1;
7707   for (i = 0; i < (int) op.num_ops; i++)
7708     {
7709       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7710       if (i == 0 && op.code == COND_EXPR)
7711         continue;
7712
7713       stmt_vec_info def_stmt_info;
7714       enum vect_def_type dt;
7715       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7716                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7717                                &vectype_op[i], &def_stmt_info))
7718         {
7719           if (dump_enabled_p ())
7720             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7721                              "use not simple.\n");
7722           return false;
7723         }
7724       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7725         continue;
7726
7727       /* For an IFN_COND_OP we might hit the reduction definition operand
7728          twice (once as definition, once as else).  */
7729       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7730         continue;
7731
7732       /* There should be only one cycle def in the stmt, the one
7733          leading to reduc_def.  */
7734       if (VECTORIZABLE_CYCLE_DEF (dt))
7735         return false;
7736
7737       if (!vectype_op[i])
7738         vectype_op[i]
7739           = get_vectype_for_scalar_type (loop_vinfo,
7740                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7741
7742       /* To properly compute ncopies we are interested in the widest
7743          non-reduction input type in case we're looking at a widening
7744          accumulation that we later handle in vect_transform_reduction.  */
7745       if (lane_reduc_code_p
7746           && vectype_op[i]
7747           && (!vectype_in
7748               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7749                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7750         vectype_in = vectype_op[i];
7751
7752       if (op.code == COND_EXPR)
7753         {
7754           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
7755           if (dt == vect_constant_def)
7756             {
7757               cond_reduc_dt = dt;
7758               cond_reduc_val = op.ops[i];
7759             }
7760           if (dt == vect_induction_def
7761               && def_stmt_info
7762               && is_nonwrapping_integer_induction (def_stmt_info, loop))
7763             {
7764               cond_reduc_dt = dt;
7765               cond_stmt_vinfo = def_stmt_info;
7766             }
7767         }
7768     }
7769   if (!vectype_in)
7770     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7771   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7772
7773   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7774   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7775   /* If we have a condition reduction, see if we can simplify it further.  */
7776   if (v_reduc_type == COND_REDUCTION)
7777     {
7778       if (slp_node)
7779         return false;
7780
7781       /* When the condition uses the reduction value in the condition, fail.  */
7782       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7783         {
7784           if (dump_enabled_p ())
7785             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7786                              "condition depends on previous iteration\n");
7787           return false;
7788         }
7789
7790       if (reduc_chain_length == 1
7791           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7792                                               OPTIMIZE_FOR_SPEED)
7793               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7794                                                  vectype_in,
7795                                                  OPTIMIZE_FOR_SPEED)))
7796         {
7797           if (dump_enabled_p ())
7798             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7799                              "optimizing condition reduction with"
7800                              " FOLD_EXTRACT_LAST.\n");
7801           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7802         }
7803       else if (cond_reduc_dt == vect_induction_def)
7804         {
7805           tree base
7806             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7807           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7808
7809           gcc_assert (TREE_CODE (base) == INTEGER_CST
7810                       && TREE_CODE (step) == INTEGER_CST);
7811           cond_reduc_val = NULL_TREE;
7812           enum tree_code cond_reduc_op_code = ERROR_MARK;
7813           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7814           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7815             ;
7816           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7817              above base; punt if base is the minimum value of the type for
7818              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7819           else if (tree_int_cst_sgn (step) == -1)
7820             {
7821               cond_reduc_op_code = MIN_EXPR;
7822               if (tree_int_cst_sgn (base) == -1)
7823                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7824               else if (tree_int_cst_lt (base,
7825                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7826                 cond_reduc_val
7827                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7828             }
7829           else
7830             {
7831               cond_reduc_op_code = MAX_EXPR;
7832               if (tree_int_cst_sgn (base) == 1)
7833                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7834               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7835                                         base))
7836                 cond_reduc_val
7837                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7838             }
7839           if (cond_reduc_val)
7840             {
7841               if (dump_enabled_p ())
7842                 dump_printf_loc (MSG_NOTE, vect_location,
7843                                  "condition expression based on "
7844                                  "integer induction.\n");
7845               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7846               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7847                 = cond_reduc_val;
7848               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7849             }
7850         }
7851       else if (cond_reduc_dt == vect_constant_def)
7852         {
7853           enum vect_def_type cond_initial_dt;
7854           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7855           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7856           if (cond_initial_dt == vect_constant_def
7857               && types_compatible_p (TREE_TYPE (cond_initial_val),
7858                                      TREE_TYPE (cond_reduc_val)))
7859             {
7860               tree e = fold_binary (LE_EXPR, boolean_type_node,
7861                                     cond_initial_val, cond_reduc_val);
7862               if (e && (integer_onep (e) || integer_zerop (e)))
7863                 {
7864                   if (dump_enabled_p ())
7865                     dump_printf_loc (MSG_NOTE, vect_location,
7866                                      "condition expression based on "
7867                                      "compile time constant.\n");
7868                   /* Record reduction code at analysis stage.  */
7869                   STMT_VINFO_REDUC_CODE (reduc_info)
7870                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7871                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7872                 }
7873             }
7874         }
7875     }
7876
7877   if (STMT_VINFO_LIVE_P (phi_info))
7878     return false;
7879
7880   if (slp_node)
7881     ncopies = 1;
7882   else
7883     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7884
7885   gcc_assert (ncopies >= 1);
7886
7887   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7888
7889   if (nested_cycle)
7890     {
7891       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7892                   == vect_double_reduction_def);
7893       double_reduc = true;
7894     }
7895
7896   /* 4.2. Check support for the epilog operation.
7897
7898           If STMT represents a reduction pattern, then the type of the
7899           reduction variable may be different than the type of the rest
7900           of the arguments.  For example, consider the case of accumulation
7901           of shorts into an int accumulator; The original code:
7902                         S1: int_a = (int) short_a;
7903           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7904
7905           was replaced with:
7906                         STMT: int_acc = widen_sum <short_a, int_acc>
7907
7908           This means that:
7909           1. The tree-code that is used to create the vector operation in the
7910              epilog code (that reduces the partial results) is not the
7911              tree-code of STMT, but is rather the tree-code of the original
7912              stmt from the pattern that STMT is replacing.  I.e, in the example
7913              above we want to use 'widen_sum' in the loop, but 'plus' in the
7914              epilog.
7915           2. The type (mode) we use to check available target support
7916              for the vector operation to be created in the *epilog*, is
7917              determined by the type of the reduction variable (in the example
7918              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7919              However the type (mode) we use to check available target support
7920              for the vector operation to be created *inside the loop*, is
7921              determined by the type of the other arguments to STMT (in the
7922              example we'd check this: optab_handler (widen_sum_optab,
7923              vect_short_mode)).
7924
7925           This is contrary to "regular" reductions, in which the types of all
7926           the arguments are the same as the type of the reduction variable.
7927           For "regular" reductions we can therefore use the same vector type
7928           (and also the same tree-code) when generating the epilog code and
7929           when generating the code inside the loop.  */
7930
7931   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7932
7933   /* If conversion might have created a conditional operation like
7934      IFN_COND_ADD already.  Use the internal code for the following checks.  */
7935   if (orig_code.is_internal_fn ())
7936     {
7937       tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7938       orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7939     }
7940
7941   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7942
7943   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7944   if (reduction_type == TREE_CODE_REDUCTION)
7945     {
7946       /* Check whether it's ok to change the order of the computation.
7947          Generally, when vectorizing a reduction we change the order of the
7948          computation.  This may change the behavior of the program in some
7949          cases, so we need to check that this is ok.  One exception is when
7950          vectorizing an outer-loop: the inner-loop is executed sequentially,
7951          and therefore vectorizing reductions in the inner-loop during
7952          outer-loop vectorization is safe.  Likewise when we are vectorizing
7953          a series of reductions using SLP and the VF is one the reductions
7954          are performed in scalar order.  */
7955       if (slp_node
7956           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7957           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7958         ;
7959       else if (needs_fold_left_reduction_p (op.type, orig_code))
7960         {
7961           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7962              is not directy used in stmt.  */
7963           if (!only_slp_reduc_chain
7964               && reduc_chain_length != 1)
7965             {
7966               if (dump_enabled_p ())
7967                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7968                                  "in-order reduction chain without SLP.\n");
7969               return false;
7970             }
7971           STMT_VINFO_REDUC_TYPE (reduc_info)
7972             = reduction_type = FOLD_LEFT_REDUCTION;
7973         }
7974       else if (!commutative_binary_op_p (orig_code, op.type)
7975                || !associative_binary_op_p (orig_code, op.type))
7976         {
7977           if (dump_enabled_p ())
7978             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979                             "reduction: not commutative/associative\n");
7980           return false;
7981         }
7982     }
7983
7984   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7985       && ncopies > 1)
7986     {
7987       if (dump_enabled_p ())
7988         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7989                          "multiple types in double reduction or condition "
7990                          "reduction or fold-left reduction.\n");
7991       return false;
7992     }
7993
7994   internal_fn reduc_fn = IFN_LAST;
7995   if (reduction_type == TREE_CODE_REDUCTION
7996       || reduction_type == FOLD_LEFT_REDUCTION
7997       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7998       || reduction_type == CONST_COND_REDUCTION)
7999     {
8000       if (reduction_type == FOLD_LEFT_REDUCTION
8001           ? fold_left_reduction_fn (orig_code, &reduc_fn)
8002           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8003         {
8004           if (reduc_fn != IFN_LAST
8005               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8006                                                   OPTIMIZE_FOR_SPEED))
8007             {
8008               if (dump_enabled_p ())
8009                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8010                                  "reduc op not supported by target.\n");
8011
8012               reduc_fn = IFN_LAST;
8013             }
8014         }
8015       else
8016         {
8017           if (!nested_cycle || double_reduc)
8018             {
8019               if (dump_enabled_p ())
8020                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8021                                  "no reduc code for scalar code.\n");
8022
8023               return false;
8024             }
8025         }
8026     }
8027   else if (reduction_type == COND_REDUCTION)
8028     {
8029       int scalar_precision
8030         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8031       cr_index_scalar_type = make_unsigned_type (scalar_precision);
8032       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8033                                                 vectype_out);
8034
8035       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8036                                           OPTIMIZE_FOR_SPEED))
8037         reduc_fn = IFN_REDUC_MAX;
8038     }
8039   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8040
8041   if (reduction_type != EXTRACT_LAST_REDUCTION
8042       && (!nested_cycle || double_reduc)
8043       && reduc_fn == IFN_LAST
8044       && !nunits_out.is_constant ())
8045     {
8046       if (dump_enabled_p ())
8047         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8048                          "missing target support for reduction on"
8049                          " variable-length vectors.\n");
8050       return false;
8051     }
8052
8053   /* For SLP reductions, see if there is a neutral value we can use.  */
8054   tree neutral_op = NULL_TREE;
8055   if (slp_node)
8056     {
8057       tree initial_value = NULL_TREE;
8058       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8059         initial_value = vect_phi_initial_value (reduc_def_phi);
8060       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8061                                              orig_code, initial_value);
8062     }
8063
8064   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8065     {
8066       /* We can't support in-order reductions of code such as this:
8067
8068            for (int i = 0; i < n1; ++i)
8069              for (int j = 0; j < n2; ++j)
8070                l += a[j];
8071
8072          since GCC effectively transforms the loop when vectorizing:
8073
8074            for (int i = 0; i < n1 / VF; ++i)
8075              for (int j = 0; j < n2; ++j)
8076                for (int k = 0; k < VF; ++k)
8077                  l += a[j];
8078
8079          which is a reassociation of the original operation.  */
8080       if (dump_enabled_p ())
8081         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8082                          "in-order double reduction not supported.\n");
8083
8084       return false;
8085     }
8086
8087   if (reduction_type == FOLD_LEFT_REDUCTION
8088       && slp_node
8089       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8090     {
8091       /* We cannot use in-order reductions in this case because there is
8092          an implicit reassociation of the operations involved.  */
8093       if (dump_enabled_p ())
8094         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8095                          "in-order unchained SLP reductions not supported.\n");
8096       return false;
8097     }
8098
8099   /* For double reductions, and for SLP reductions with a neutral value,
8100      we construct a variable-length initial vector by loading a vector
8101      full of the neutral value and then shift-and-inserting the start
8102      values into the low-numbered elements.  */
8103   if ((double_reduc || neutral_op)
8104       && !nunits_out.is_constant ()
8105       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8106                                           vectype_out, OPTIMIZE_FOR_SPEED))
8107     {
8108       if (dump_enabled_p ())
8109         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8110                          "reduction on variable-length vectors requires"
8111                          " target support for a vector-shift-and-insert"
8112                          " operation.\n");
8113       return false;
8114     }
8115
8116   /* Check extra constraints for variable-length unchained SLP reductions.  */
8117   if (slp_node
8118       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8119       && !nunits_out.is_constant ())
8120     {
8121       /* We checked above that we could build the initial vector when
8122          there's a neutral element value.  Check here for the case in
8123          which each SLP statement has its own initial value and in which
8124          that value needs to be repeated for every instance of the
8125          statement within the initial vector.  */
8126       unsigned int group_size = SLP_TREE_LANES (slp_node);
8127       if (!neutral_op
8128           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8129                                               TREE_TYPE (vectype_out)))
8130         {
8131           if (dump_enabled_p ())
8132             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8133                              "unsupported form of SLP reduction for"
8134                              " variable-length vectors: cannot build"
8135                              " initial vector.\n");
8136           return false;
8137         }
8138       /* The epilogue code relies on the number of elements being a multiple
8139          of the group size.  The duplicate-and-interleave approach to setting
8140          up the initial vector does too.  */
8141       if (!multiple_p (nunits_out, group_size))
8142         {
8143           if (dump_enabled_p ())
8144             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8145                              "unsupported form of SLP reduction for"
8146                              " variable-length vectors: the vector size"
8147                              " is not a multiple of the number of results.\n");
8148           return false;
8149         }
8150     }
8151
8152   if (reduction_type == COND_REDUCTION)
8153     {
8154       widest_int ni;
8155
8156       if (! max_loop_iterations (loop, &ni))
8157         {
8158           if (dump_enabled_p ())
8159             dump_printf_loc (MSG_NOTE, vect_location,
8160                              "loop count not known, cannot create cond "
8161                              "reduction.\n");
8162           return false;
8163         }
8164       /* Convert backedges to iterations.  */
8165       ni += 1;
8166
8167       /* The additional index will be the same type as the condition.  Check
8168          that the loop can fit into this less one (because we'll use up the
8169          zero slot for when there are no matches).  */
8170       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8171       if (wi::geu_p (ni, wi::to_widest (max_index)))
8172         {
8173           if (dump_enabled_p ())
8174             dump_printf_loc (MSG_NOTE, vect_location,
8175                              "loop size is greater than data size.\n");
8176           return false;
8177         }
8178     }
8179
8180   /* In case the vectorization factor (VF) is bigger than the number
8181      of elements that we can fit in a vectype (nunits), we have to generate
8182      more than one vector stmt - i.e - we need to "unroll" the
8183      vector stmt by a factor VF/nunits.  For more details see documentation
8184      in vectorizable_operation.  */
8185
8186   /* If the reduction is used in an outer loop we need to generate
8187      VF intermediate results, like so (e.g. for ncopies=2):
8188         r0 = phi (init, r0)
8189         r1 = phi (init, r1)
8190         r0 = x0 + r0;
8191         r1 = x1 + r1;
8192     (i.e. we generate VF results in 2 registers).
8193     In this case we have a separate def-use cycle for each copy, and therefore
8194     for each copy we get the vector def for the reduction variable from the
8195     respective phi node created for this copy.
8196
8197     Otherwise (the reduction is unused in the loop nest), we can combine
8198     together intermediate results, like so (e.g. for ncopies=2):
8199         r = phi (init, r)
8200         r = x0 + r;
8201         r = x1 + r;
8202    (i.e. we generate VF/2 results in a single register).
8203    In this case for each copy we get the vector def for the reduction variable
8204    from the vectorized reduction operation generated in the previous iteration.
8205
8206    This only works when we see both the reduction PHI and its only consumer
8207    in vectorizable_reduction and there are no intermediate stmts
8208    participating.  When unrolling we want each unrolled iteration to have its
8209    own reduction accumulator since one of the main goals of unrolling a
8210    reduction is to reduce the aggregate loop-carried latency.  */
8211   if (ncopies > 1
8212       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8213       && reduc_chain_length == 1
8214       && loop_vinfo->suggested_unroll_factor == 1)
8215     single_defuse_cycle = true;
8216
8217   if (single_defuse_cycle || lane_reduc_code_p)
8218     {
8219       gcc_assert (op.code != COND_EXPR);
8220
8221       /* 4. Supportable by target?  */
8222       bool ok = true;
8223
8224       /* 4.1. check support for the operation in the loop
8225
8226          This isn't necessary for the lane reduction codes, since they
8227          can only be produced by pattern matching, and it's up to the
8228          pattern matcher to test for support.  The main reason for
8229          specifically skipping this step is to avoid rechecking whether
8230          mixed-sign dot-products can be implemented using signed
8231          dot-products.  */
8232       machine_mode vec_mode = TYPE_MODE (vectype_in);
8233       if (!lane_reduc_code_p
8234           && !directly_supported_p (op.code, vectype_in, optab_vector))
8235         {
8236           if (dump_enabled_p ())
8237             dump_printf (MSG_NOTE, "op not supported by target.\n");
8238           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8239               || !vect_can_vectorize_without_simd_p (op.code))
8240             ok = false;
8241           else
8242             if (dump_enabled_p ())
8243               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8244         }
8245
8246       if (vect_emulated_vector_p (vectype_in)
8247           && !vect_can_vectorize_without_simd_p (op.code))
8248         {
8249           if (dump_enabled_p ())
8250             dump_printf (MSG_NOTE, "using word mode not possible.\n");
8251           return false;
8252         }
8253
8254       /* lane-reducing operations have to go through vect_transform_reduction.
8255          For the other cases try without the single cycle optimization.  */
8256       if (!ok)
8257         {
8258           if (lane_reduc_code_p)
8259             return false;
8260           else
8261             single_defuse_cycle = false;
8262         }
8263     }
8264   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8265
8266   /* If the reduction stmt is one of the patterns that have lane
8267      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
8268   if ((ncopies > 1 && ! single_defuse_cycle)
8269       && lane_reduc_code_p)
8270     {
8271       if (dump_enabled_p ())
8272         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8273                          "multi def-use cycle not possible for lane-reducing "
8274                          "reduction operation\n");
8275       return false;
8276     }
8277
8278   if (slp_node
8279       && !(!single_defuse_cycle
8280            && !lane_reduc_code_p
8281            && reduction_type != FOLD_LEFT_REDUCTION))
8282     for (i = 0; i < (int) op.num_ops; i++)
8283       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8284         {
8285           if (dump_enabled_p ())
8286             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8287                              "incompatible vector types for invariants\n");
8288           return false;
8289         }
8290
8291   if (slp_node)
8292     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8293   else
8294     vec_num = 1;
8295
8296   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8297                              reduction_type, ncopies, cost_vec);
8298   /* Cost the reduction op inside the loop if transformed via
8299      vect_transform_reduction.  Otherwise this is costed by the
8300      separate vectorizable_* routines.  */
8301   if (single_defuse_cycle || lane_reduc_code_p)
8302     {
8303       int factor = 1;
8304       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8305         /* Three dot-products and a subtraction.  */
8306         factor = 4;
8307       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8308                         stmt_info, 0, vect_body);
8309     }
8310
8311   if (dump_enabled_p ()
8312       && reduction_type == FOLD_LEFT_REDUCTION)
8313     dump_printf_loc (MSG_NOTE, vect_location,
8314                      "using an in-order (fold-left) reduction.\n");
8315   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8316   /* All but single defuse-cycle optimized, lane-reducing and fold-left
8317      reductions go through their own vectorizable_* routines.  */
8318   if (!single_defuse_cycle
8319       && !lane_reduc_code_p
8320       && reduction_type != FOLD_LEFT_REDUCTION)
8321     {
8322       stmt_vec_info tem
8323         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8324       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8325         {
8326           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8327           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8328         }
8329       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8330       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8331     }
8332   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8333     {
8334       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8335       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8336       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8337
8338       if (reduction_type != FOLD_LEFT_REDUCTION
8339           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8340           && (cond_fn == IFN_LAST
8341               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8342                                                   OPTIMIZE_FOR_SPEED)))
8343         {
8344           if (dump_enabled_p ())
8345             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8346                              "can't operate on partial vectors because"
8347                              " no conditional operation is available.\n");
8348           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8349         }
8350       else if (reduction_type == FOLD_LEFT_REDUCTION
8351                && reduc_fn == IFN_LAST
8352                && !expand_vec_cond_expr_p (vectype_in,
8353                                            truth_type_for (vectype_in),
8354                                            SSA_NAME))
8355         {
8356           if (dump_enabled_p ())
8357             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8358                              "can't operate on partial vectors because"
8359                              " no conditional operation is available.\n");
8360           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8361         }
8362       else if (reduction_type == FOLD_LEFT_REDUCTION
8363                && internal_fn_mask_index (reduc_fn) == -1
8364                && FLOAT_TYPE_P (vectype_in)
8365                && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8366         {
8367           if (dump_enabled_p ())
8368             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8369                              "can't operate on partial vectors because"
8370                              " signed zeros cannot be preserved.\n");
8371           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8372         }
8373       else
8374         {
8375           internal_fn mask_reduc_fn
8376             = get_masked_reduction_fn (reduc_fn, vectype_in);
8377
8378           if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8379             vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8380                                   vectype_in, 1);
8381           else
8382             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8383                                    vectype_in, NULL);
8384         }
8385     }
8386   return true;
8387 }
8388
8389 /* STMT_INFO is a dot-product reduction whose multiplication operands
8390    have different signs.  Emit a sequence to emulate the operation
8391    using a series of signed DOT_PROD_EXPRs and return the last
8392    statement generated.  VEC_DEST is the result of the vector operation
8393    and VOP lists its inputs.  */
8394
8395 static gassign *
8396 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8397                              gimple_stmt_iterator *gsi, tree vec_dest,
8398                              tree vop[3])
8399 {
8400   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8401   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8402   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8403   gimple *new_stmt;
8404
8405   /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
8406   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8407     std::swap (vop[0], vop[1]);
8408
8409   /* Convert all inputs to signed types.  */
8410   for (int i = 0; i < 3; ++i)
8411     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8412       {
8413         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8414         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8415         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8416         vop[i] = tmp;
8417       }
8418
8419   /* In the comments below we assume 8-bit inputs for simplicity,
8420      but the approach works for any full integer type.  */
8421
8422   /* Create a vector of -128.  */
8423   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8424   tree min_narrow = build_vector_from_val (narrow_vectype,
8425                                            min_narrow_elttype);
8426
8427   /* Create a vector of 64.  */
8428   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8429   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8430   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8431
8432   /* Emit: SUB_RES = VOP[0] - 128.  */
8433   tree sub_res = make_ssa_name (narrow_vectype);
8434   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8435   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8436
8437   /* Emit:
8438
8439        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8440        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8441        STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8442
8443      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8444      Doing the two 64 * y steps first allows more time to compute x.  */
8445   tree stage1 = make_ssa_name (wide_vectype);
8446   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8447                                   vop[1], half_narrow, vop[2]);
8448   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8449
8450   tree stage2 = make_ssa_name (wide_vectype);
8451   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8452                                   vop[1], half_narrow, stage1);
8453   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8454
8455   tree stage3 = make_ssa_name (wide_vectype);
8456   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8457                                   sub_res, vop[1], stage2);
8458   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8459
8460   /* Convert STAGE3 to the reduction type.  */
8461   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8462 }
8463
8464 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8465    value.  */
8466
8467 bool
8468 vect_transform_reduction (loop_vec_info loop_vinfo,
8469                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8470                           gimple **vec_stmt, slp_tree slp_node)
8471 {
8472   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8473   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8474   int i;
8475   int ncopies;
8476   int vec_num;
8477
8478   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8479   gcc_assert (reduc_info->is_reduc_info);
8480
8481   if (nested_in_vect_loop_p (loop, stmt_info))
8482     {
8483       loop = loop->inner;
8484       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8485     }
8486
8487   gimple_match_op op;
8488   if (!gimple_extract_op (stmt_info->stmt, &op))
8489     gcc_unreachable ();
8490
8491   /* All uses but the last are expected to be defined in the loop.
8492      The last use is the reduction variable.  In case of nested cycle this
8493      assumption is not true: we use reduc_index to record the index of the
8494      reduction variable.  */
8495   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8496   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8497   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8498   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8499
8500   if (slp_node)
8501     {
8502       ncopies = 1;
8503       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8504     }
8505   else
8506     {
8507       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8508       vec_num = 1;
8509     }
8510
8511   code_helper code = canonicalize_code (op.code, op.type);
8512   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8513
8514   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8515   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8516   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8517
8518   /* Transform.  */
8519   tree new_temp = NULL_TREE;
8520   auto_vec<tree> vec_oprnds0;
8521   auto_vec<tree> vec_oprnds1;
8522   auto_vec<tree> vec_oprnds2;
8523   tree def0;
8524
8525   if (dump_enabled_p ())
8526     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8527
8528   /* FORNOW: Multiple types are not supported for condition.  */
8529   if (code == COND_EXPR)
8530     gcc_assert (ncopies == 1);
8531
8532   /* A binary COND_OP reduction must have the same definition and else
8533      value. */
8534   bool cond_fn_p = code.is_internal_fn ()
8535     && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8536   if (cond_fn_p)
8537     {
8538       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8539                   || code == IFN_COND_MUL || code == IFN_COND_AND
8540                   || code == IFN_COND_IOR || code == IFN_COND_XOR);
8541       gcc_assert (op.num_ops == 4
8542                   && (op.ops[reduc_index]
8543                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
8544     }
8545
8546   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8547
8548   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8549   if (reduction_type == FOLD_LEFT_REDUCTION)
8550     {
8551       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8552       gcc_assert (code.is_tree_code () || cond_fn_p);
8553       return vectorize_fold_left_reduction
8554           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8555            code, reduc_fn, op.ops, op.num_ops, vectype_in,
8556            reduc_index, masks, lens);
8557     }
8558
8559   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8560   gcc_assert (single_defuse_cycle
8561               || code == DOT_PROD_EXPR
8562               || code == WIDEN_SUM_EXPR
8563               || code == SAD_EXPR);
8564
8565   /* Create the destination vector  */
8566   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8567   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8568
8569   /* Get NCOPIES vector definitions for all operands except the reduction
8570      definition.  */
8571   if (!cond_fn_p)
8572     {
8573       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8574                          single_defuse_cycle && reduc_index == 0
8575                          ? NULL_TREE : op.ops[0], &vec_oprnds0,
8576                          single_defuse_cycle && reduc_index == 1
8577                          ? NULL_TREE : op.ops[1], &vec_oprnds1,
8578                          op.num_ops == 3
8579                          && !(single_defuse_cycle && reduc_index == 2)
8580                          ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8581     }
8582   else
8583     {
8584       /* For a conditional operation pass the truth type as mask
8585          vectype.  */
8586       gcc_assert (single_defuse_cycle
8587                   && (reduc_index == 1 || reduc_index == 2));
8588       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8589                          op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8590                          reduc_index == 1 ? NULL_TREE : op.ops[1],
8591                          NULL_TREE, &vec_oprnds1,
8592                          reduc_index == 2 ? NULL_TREE : op.ops[2],
8593                          NULL_TREE, &vec_oprnds2);
8594     }
8595
8596   /* For single def-use cycles get one copy of the vectorized reduction
8597      definition.  */
8598   if (single_defuse_cycle)
8599     {
8600       gcc_assert (!slp_node);
8601       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8602                                      op.ops[reduc_index],
8603                                      reduc_index == 0 ? &vec_oprnds0
8604                                      : (reduc_index == 1 ? &vec_oprnds1
8605                                         : &vec_oprnds2));
8606     }
8607
8608   bool emulated_mixed_dot_prod
8609     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8610   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8611     {
8612       gimple *new_stmt;
8613       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8614       if (masked_loop_p && !mask_by_cond_expr)
8615         {
8616           /* No conditional ifns have been defined for dot-product yet.  */
8617           gcc_assert (code != DOT_PROD_EXPR);
8618
8619           /* Make sure that the reduction accumulator is vop[0].  */
8620           if (reduc_index == 1)
8621             {
8622               gcc_assert (commutative_binary_op_p (code, op.type));
8623               std::swap (vop[0], vop[1]);
8624             }
8625           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8626                                           vec_num * ncopies, vectype_in, i);
8627           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8628                                                     vop[0], vop[1], vop[0]);
8629           new_temp = make_ssa_name (vec_dest, call);
8630           gimple_call_set_lhs (call, new_temp);
8631           gimple_call_set_nothrow (call, true);
8632           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8633           new_stmt = call;
8634         }
8635       else
8636         {
8637           if (op.num_ops >= 3)
8638             vop[2] = vec_oprnds2[i];
8639
8640           if (masked_loop_p && mask_by_cond_expr)
8641             {
8642               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8643                                               vec_num * ncopies, vectype_in, i);
8644               build_vect_cond_expr (code, vop, mask, gsi);
8645             }
8646
8647           if (emulated_mixed_dot_prod)
8648             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8649                                                     vec_dest, vop);
8650
8651           else if (code.is_internal_fn () && !cond_fn_p)
8652             new_stmt = gimple_build_call_internal (internal_fn (code),
8653                                                    op.num_ops,
8654                                                    vop[0], vop[1], vop[2]);
8655           else if (code.is_internal_fn () && cond_fn_p)
8656             new_stmt = gimple_build_call_internal (internal_fn (code),
8657                                                    op.num_ops,
8658                                                    vop[0], vop[1], vop[2],
8659                                                    vop[1]);
8660           else
8661             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8662                                             vop[0], vop[1], vop[2]);
8663           new_temp = make_ssa_name (vec_dest, new_stmt);
8664           gimple_set_lhs (new_stmt, new_temp);
8665           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8666         }
8667
8668       if (slp_node)
8669         slp_node->push_vec_def (new_stmt);
8670       else if (single_defuse_cycle
8671                && i < ncopies - 1)
8672         {
8673           if (reduc_index == 0)
8674             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8675           else if (reduc_index == 1)
8676             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8677           else if (reduc_index == 2)
8678             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8679         }
8680       else
8681         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8682     }
8683
8684   if (!slp_node)
8685     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8686
8687   return true;
8688 }
8689
8690 /* Transform phase of a cycle PHI.  */
8691
8692 bool
8693 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8694                           stmt_vec_info stmt_info, gimple **vec_stmt,
8695                           slp_tree slp_node, slp_instance slp_node_instance)
8696 {
8697   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8698   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8699   int i;
8700   int ncopies;
8701   int j;
8702   bool nested_cycle = false;
8703   int vec_num;
8704
8705   if (nested_in_vect_loop_p (loop, stmt_info))
8706     {
8707       loop = loop->inner;
8708       nested_cycle = true;
8709     }
8710
8711   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8712   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8713   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8714   gcc_assert (reduc_info->is_reduc_info);
8715
8716   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8717       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8718     /* Leave the scalar phi in place.  */
8719     return true;
8720
8721   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8722   /* For a nested cycle we do not fill the above.  */
8723   if (!vectype_in)
8724     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8725   gcc_assert (vectype_in);
8726
8727   if (slp_node)
8728     {
8729       /* The size vect_schedule_slp_instance computes is off for us.  */
8730       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8731                                       * SLP_TREE_LANES (slp_node), vectype_in);
8732       ncopies = 1;
8733     }
8734   else
8735     {
8736       vec_num = 1;
8737       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8738     }
8739
8740   /* Check whether we should use a single PHI node and accumulate
8741      vectors to one before the backedge.  */
8742   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8743     ncopies = 1;
8744
8745   /* Create the destination vector  */
8746   gphi *phi = as_a <gphi *> (stmt_info->stmt);
8747   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8748                                                vectype_out);
8749
8750   /* Get the loop-entry arguments.  */
8751   tree vec_initial_def = NULL_TREE;
8752   auto_vec<tree> vec_initial_defs;
8753   if (slp_node)
8754     {
8755       vec_initial_defs.reserve (vec_num);
8756       if (nested_cycle)
8757         {
8758           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8759           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8760                              &vec_initial_defs);
8761         }
8762       else
8763         {
8764           gcc_assert (slp_node == slp_node_instance->reduc_phis);
8765           vec<tree> &initial_values = reduc_info->reduc_initial_values;
8766           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8767
8768           unsigned int num_phis = stmts.length ();
8769           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8770             num_phis = 1;
8771           initial_values.reserve (num_phis);
8772           for (unsigned int i = 0; i < num_phis; ++i)
8773             {
8774               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8775               initial_values.quick_push (vect_phi_initial_value (this_phi));
8776             }
8777           if (vec_num == 1)
8778             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8779           if (!initial_values.is_empty ())
8780             {
8781               tree initial_value
8782                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8783               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8784               tree neutral_op
8785                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8786                                             code, initial_value);
8787               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8788                                               &vec_initial_defs, vec_num,
8789                                               stmts.length (), neutral_op);
8790             }
8791         }
8792     }
8793   else
8794     {
8795       /* Get at the scalar def before the loop, that defines the initial
8796          value of the reduction variable.  */
8797       tree initial_def = vect_phi_initial_value (phi);
8798       reduc_info->reduc_initial_values.safe_push (initial_def);
8799       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8800          and we can't use zero for induc_val, use initial_def.  Similarly
8801          for REDUC_MIN and initial_def larger than the base.  */
8802       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8803         {
8804           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8805           if (TREE_CODE (initial_def) == INTEGER_CST
8806               && !integer_zerop (induc_val)
8807               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8808                    && tree_int_cst_lt (initial_def, induc_val))
8809                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8810                       && tree_int_cst_lt (induc_val, initial_def))))
8811             {
8812               induc_val = initial_def;
8813               /* Communicate we used the initial_def to epilouge
8814                  generation.  */
8815               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8816             }
8817           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8818         }
8819       else if (nested_cycle)
8820         {
8821           /* Do not use an adjustment def as that case is not supported
8822              correctly if ncopies is not one.  */
8823           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8824                                          ncopies, initial_def,
8825                                          &vec_initial_defs);
8826         }
8827       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8828                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8829         /* Fill the initial vector with the initial scalar value.  */
8830         vec_initial_def
8831           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8832                                            initial_def, initial_def);
8833       else
8834         {
8835           if (ncopies == 1)
8836             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8837           if (!reduc_info->reduc_initial_values.is_empty ())
8838             {
8839               initial_def = reduc_info->reduc_initial_values[0];
8840               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8841               tree neutral_op
8842                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8843                                             code, initial_def);
8844               gcc_assert (neutral_op);
8845               /* Try to simplify the vector initialization by applying an
8846                  adjustment after the reduction has been performed.  */
8847               if (!reduc_info->reused_accumulator
8848                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8849                   && !operand_equal_p (neutral_op, initial_def))
8850                 {
8851                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8852                     = initial_def;
8853                   initial_def = neutral_op;
8854                 }
8855               vec_initial_def
8856                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8857                                                  initial_def, neutral_op);
8858             }
8859         }
8860     }
8861
8862   if (vec_initial_def)
8863     {
8864       vec_initial_defs.create (ncopies);
8865       for (i = 0; i < ncopies; ++i)
8866         vec_initial_defs.quick_push (vec_initial_def);
8867     }
8868
8869   if (auto *accumulator = reduc_info->reused_accumulator)
8870     {
8871       tree def = accumulator->reduc_input;
8872       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8873         {
8874           unsigned int nreduc;
8875           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8876                                             (TREE_TYPE (def)),
8877                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8878                                           &nreduc);
8879           gcc_assert (res);
8880           gimple_seq stmts = NULL;
8881           /* Reduce the single vector to a smaller one.  */
8882           if (nreduc != 1)
8883             {
8884               /* Perform the reduction in the appropriate type.  */
8885               tree rvectype = vectype_out;
8886               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8887                                               TREE_TYPE (TREE_TYPE (def))))
8888                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8889                                               TYPE_VECTOR_SUBPARTS
8890                                                 (vectype_out));
8891               def = vect_create_partial_epilog (def, rvectype,
8892                                                 STMT_VINFO_REDUC_CODE
8893                                                   (reduc_info),
8894                                                 &stmts);
8895             }
8896           /* The epilogue loop might use a different vector mode, like
8897              VNx2DI vs. V2DI.  */
8898           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8899             {
8900               tree reduc_type = build_vector_type_for_mode
8901                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8902               def = gimple_convert (&stmts, reduc_type, def);
8903             }
8904           /* Adjust the input so we pick up the partially reduced value
8905              for the skip edge in vect_create_epilog_for_reduction.  */
8906           accumulator->reduc_input = def;
8907           /* And the reduction could be carried out using a different sign.  */
8908           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8909             def = gimple_convert (&stmts, vectype_out, def);
8910           if (loop_vinfo->main_loop_edge)
8911             {
8912               /* While we'd like to insert on the edge this will split
8913                  blocks and disturb bookkeeping, we also will eventually
8914                  need this on the skip edge.  Rely on sinking to
8915                  fixup optimal placement and insert in the pred.  */
8916               gimple_stmt_iterator gsi
8917                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8918               /* Insert before a cond that eventually skips the
8919                  epilogue.  */
8920               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8921                 gsi_prev (&gsi);
8922               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8923             }
8924           else
8925             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8926                                               stmts);
8927         }
8928       if (loop_vinfo->main_loop_edge)
8929         vec_initial_defs[0]
8930           = vect_get_main_loop_result (loop_vinfo, def,
8931                                        vec_initial_defs[0]);
8932       else
8933         vec_initial_defs.safe_push (def);
8934     }
8935
8936   /* Generate the reduction PHIs upfront.  */
8937   for (i = 0; i < vec_num; i++)
8938     {
8939       tree vec_init_def = vec_initial_defs[i];
8940       for (j = 0; j < ncopies; j++)
8941         {
8942           /* Create the reduction-phi that defines the reduction
8943              operand.  */
8944           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8945
8946           /* Set the loop-entry arg of the reduction-phi.  */
8947           if (j != 0 && nested_cycle)
8948             vec_init_def = vec_initial_defs[j];
8949           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8950                        UNKNOWN_LOCATION);
8951
8952           /* The loop-latch arg is set in epilogue processing.  */
8953
8954           if (slp_node)
8955             slp_node->push_vec_def (new_phi);
8956           else
8957             {
8958               if (j == 0)
8959                 *vec_stmt = new_phi;
8960               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8961             }
8962         }
8963     }
8964
8965   return true;
8966 }
8967
8968 /* Vectorizes LC PHIs.  */
8969
8970 bool
8971 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8972                      stmt_vec_info stmt_info, gimple **vec_stmt,
8973                      slp_tree slp_node)
8974 {
8975   if (!loop_vinfo
8976       || !is_a <gphi *> (stmt_info->stmt)
8977       || gimple_phi_num_args (stmt_info->stmt) != 1)
8978     return false;
8979
8980   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8981       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8982     return false;
8983
8984   if (!vec_stmt) /* transformation not required.  */
8985     {
8986       /* Deal with copies from externs or constants that disguise as
8987          loop-closed PHI nodes (PR97886).  */
8988       if (slp_node
8989           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8990                                                 SLP_TREE_VECTYPE (slp_node)))
8991         {
8992           if (dump_enabled_p ())
8993             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8994                              "incompatible vector types for invariants\n");
8995           return false;
8996         }
8997       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8998       return true;
8999     }
9000
9001   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9002   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9003   basic_block bb = gimple_bb (stmt_info->stmt);
9004   edge e = single_pred_edge (bb);
9005   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9006   auto_vec<tree> vec_oprnds;
9007   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9008                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9009                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9010   for (unsigned i = 0; i < vec_oprnds.length (); i++)
9011     {
9012       /* Create the vectorized LC PHI node.  */
9013       gphi *new_phi = create_phi_node (vec_dest, bb);
9014       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9015       if (slp_node)
9016         slp_node->push_vec_def (new_phi);
9017       else
9018         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9019     }
9020   if (!slp_node)
9021     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9022
9023   return true;
9024 }
9025
9026 /* Vectorizes PHIs.  */
9027
9028 bool
9029 vectorizable_phi (vec_info *,
9030                   stmt_vec_info stmt_info, gimple **vec_stmt,
9031                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9032 {
9033   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9034     return false;
9035
9036   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9037     return false;
9038
9039   tree vectype = SLP_TREE_VECTYPE (slp_node);
9040
9041   if (!vec_stmt) /* transformation not required.  */
9042     {
9043       slp_tree child;
9044       unsigned i;
9045       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9046         if (!child)
9047           {
9048             if (dump_enabled_p ())
9049               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9050                                "PHI node with unvectorized backedge def\n");
9051             return false;
9052           }
9053         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9054           {
9055             if (dump_enabled_p ())
9056               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9057                                "incompatible vector types for invariants\n");
9058             return false;
9059           }
9060         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9061                  && !useless_type_conversion_p (vectype,
9062                                                 SLP_TREE_VECTYPE (child)))
9063           {
9064             /* With bools we can have mask and non-mask precision vectors
9065                or different non-mask precisions.  while pattern recog is
9066                supposed to guarantee consistency here bugs in it can cause
9067                mismatches (PR103489 and PR103800 for example).
9068                Deal with them here instead of ICEing later.  */
9069             if (dump_enabled_p ())
9070               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9071                                "incompatible vector type setup from "
9072                                "bool pattern detection\n");
9073             return false;
9074           }
9075
9076       /* For single-argument PHIs assume coalescing which means zero cost
9077          for the scalar and the vector PHIs.  This avoids artificially
9078          favoring the vector path (but may pessimize it in some cases).  */
9079       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9080         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9081                           vector_stmt, stmt_info, vectype, 0, vect_body);
9082       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9083       return true;
9084     }
9085
9086   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9087   basic_block bb = gimple_bb (stmt_info->stmt);
9088   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9089   auto_vec<gphi *> new_phis;
9090   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9091     {
9092       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9093
9094       /* Skip not yet vectorized defs.  */
9095       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9096           && SLP_TREE_VEC_DEFS (child).is_empty ())
9097         continue;
9098
9099       auto_vec<tree> vec_oprnds;
9100       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9101       if (!new_phis.exists ())
9102         {
9103           new_phis.create (vec_oprnds.length ());
9104           for (unsigned j = 0; j < vec_oprnds.length (); j++)
9105             {
9106               /* Create the vectorized LC PHI node.  */
9107               new_phis.quick_push (create_phi_node (vec_dest, bb));
9108               slp_node->push_vec_def (new_phis[j]);
9109             }
9110         }
9111       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9112       for (unsigned j = 0; j < vec_oprnds.length (); j++)
9113         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9114     }
9115   /* We should have at least one already vectorized child.  */
9116   gcc_assert (new_phis.exists ());
9117
9118   return true;
9119 }
9120
9121 /* Vectorizes first order recurrences.  An overview of the transformation
9122    is described below. Suppose we have the following loop.
9123
9124      int t = 0;
9125      for (int i = 0; i < n; ++i)
9126        {
9127          b[i] = a[i] - t;
9128          t = a[i];
9129        }
9130
9131    There is a first-order recurrence on 'a'. For this loop, the scalar IR
9132    looks (simplified) like:
9133
9134     scalar.preheader:
9135       init = 0;
9136
9137     scalar.body:
9138       i = PHI <0(scalar.preheader), i+1(scalar.body)>
9139       _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9140       _1 = a[i]
9141       b[i] = _1 - _2
9142       if (i < n) goto scalar.body
9143
9144    In this example, _2 is a recurrence because it's value depends on the
9145    previous iteration.  We vectorize this as (VF = 4)
9146
9147     vector.preheader:
9148       vect_init = vect_cst(..., ..., ..., 0)
9149
9150     vector.body
9151       i = PHI <0(vector.preheader), i+4(vector.body)>
9152       vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9153       vect_2 = a[i, i+1, i+2, i+3];
9154       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9155       b[i, i+1, i+2, i+3] = vect_2 - vect_3
9156       if (..) goto vector.body
9157
9158    In this function, vectorizable_recurr, we code generate both the
9159    vector PHI node and the permute since those together compute the
9160    vectorized value of the scalar PHI.  We do not yet have the
9161    backedge value to fill in there nor into the vec_perm.  Those
9162    are filled in maybe_set_vectorized_backedge_value and
9163    vect_schedule_scc.
9164
9165    TODO:  Since the scalar loop does not have a use of the recurrence
9166    outside of the loop the natural way to implement peeling via
9167    vectorizing the live value doesn't work.  For now peeling of loops
9168    with a recurrence is not implemented.  For SLP the supported cases
9169    are restricted to those requiring a single vector recurrence PHI.  */
9170
9171 bool
9172 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9173                      gimple **vec_stmt, slp_tree slp_node,
9174                      stmt_vector_for_cost *cost_vec)
9175 {
9176   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9177     return false;
9178
9179   gphi *phi = as_a<gphi *> (stmt_info->stmt);
9180
9181   /* So far we only support first-order recurrence auto-vectorization.  */
9182   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9183     return false;
9184
9185   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9186   unsigned ncopies;
9187   if (slp_node)
9188     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9189   else
9190     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9191   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9192   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9193   /* We need to be able to make progress with a single vector.  */
9194   if (maybe_gt (dist * 2, nunits))
9195     {
9196       if (dump_enabled_p ())
9197         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9198                          "first order recurrence exceeds half of "
9199                          "a vector\n");
9200       return false;
9201     }
9202
9203   /* First-order recurrence autovectorization needs to handle permutation
9204      with indices = [nunits-1, nunits, nunits+1, ...].  */
9205   vec_perm_builder sel (nunits, 1, 3);
9206   for (int i = 0; i < 3; ++i)
9207     sel.quick_push (nunits - dist + i);
9208   vec_perm_indices indices (sel, 2, nunits);
9209
9210   if (!vec_stmt) /* transformation not required.  */
9211     {
9212       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9213                                  indices))
9214         return false;
9215
9216       if (slp_node)
9217         {
9218           /* We eventually need to set a vector type on invariant
9219              arguments.  */
9220           unsigned j;
9221           slp_tree child;
9222           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9223             if (!vect_maybe_update_slp_op_vectype
9224                   (child, SLP_TREE_VECTYPE (slp_node)))
9225               {
9226                 if (dump_enabled_p ())
9227                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9228                                    "incompatible vector types for "
9229                                    "invariants\n");
9230                 return false;
9231               }
9232         }
9233       /* The recurrence costs the initialization vector and one permute
9234          for each copy.  */
9235       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9236                                                  stmt_info, 0, vect_prologue);
9237       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9238                                                stmt_info, 0, vect_body);
9239       if (dump_enabled_p ())
9240         dump_printf_loc (MSG_NOTE, vect_location,
9241                          "vectorizable_recurr: inside_cost = %d, "
9242                          "prologue_cost = %d .\n", inside_cost,
9243                          prologue_cost);
9244
9245       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9246       return true;
9247     }
9248
9249   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9250   basic_block bb = gimple_bb (phi);
9251   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9252   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9253     {
9254       gimple_seq stmts = NULL;
9255       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9256       gsi_insert_seq_on_edge_immediate (pe, stmts);
9257     }
9258   tree vec_init = build_vector_from_val (vectype, preheader);
9259   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9260
9261   /* Create the vectorized first-order PHI node.  */
9262   tree vec_dest = vect_get_new_vect_var (vectype,
9263                                          vect_simple_var, "vec_recur_");
9264   gphi *new_phi = create_phi_node (vec_dest, bb);
9265   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9266
9267   /* Insert shuffles the first-order recurrence autovectorization.
9268        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
9269   tree perm = vect_gen_perm_mask_checked (vectype, indices);
9270
9271   /* Insert the required permute after the latch definition.  The
9272      second and later operands are tentative and will be updated when we have
9273      vectorized the latch definition.  */
9274   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9275   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9276   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9277   gsi_next (&gsi2);
9278
9279   for (unsigned i = 0; i < ncopies; ++i)
9280     {
9281       vec_dest = make_ssa_name (vectype);
9282       gassign *vperm
9283           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9284                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
9285                                  NULL, perm);
9286       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9287
9288       if (slp_node)
9289         slp_node->push_vec_def (vperm);
9290       else
9291         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9292     }
9293
9294   if (!slp_node)
9295     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9296   return true;
9297 }
9298
9299 /* Return true if VECTYPE represents a vector that requires lowering
9300    by the vector lowering pass.  */
9301
9302 bool
9303 vect_emulated_vector_p (tree vectype)
9304 {
9305   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9306           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9307               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9308 }
9309
9310 /* Return true if we can emulate CODE on an integer mode representation
9311    of a vector.  */
9312
9313 bool
9314 vect_can_vectorize_without_simd_p (tree_code code)
9315 {
9316   switch (code)
9317     {
9318     case PLUS_EXPR:
9319     case MINUS_EXPR:
9320     case NEGATE_EXPR:
9321     case BIT_AND_EXPR:
9322     case BIT_IOR_EXPR:
9323     case BIT_XOR_EXPR:
9324     case BIT_NOT_EXPR:
9325       return true;
9326
9327     default:
9328       return false;
9329     }
9330 }
9331
9332 /* Likewise, but taking a code_helper.  */
9333
9334 bool
9335 vect_can_vectorize_without_simd_p (code_helper code)
9336 {
9337   return (code.is_tree_code ()
9338           && vect_can_vectorize_without_simd_p (tree_code (code)));
9339 }
9340
9341 /* Create vector init for vectorized iv.  */
9342 static tree
9343 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9344                                tree step_expr, poly_uint64 nunits,
9345                                tree vectype,
9346                                enum vect_induction_op_type induction_type)
9347 {
9348   unsigned HOST_WIDE_INT const_nunits;
9349   tree vec_shift, vec_init, new_name;
9350   unsigned i;
9351   tree itype = TREE_TYPE (vectype);
9352
9353   /* iv_loop is the loop to be vectorized. Create:
9354      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
9355   new_name = gimple_convert (stmts, itype, init_expr);
9356   switch (induction_type)
9357     {
9358     case vect_step_op_shr:
9359     case vect_step_op_shl:
9360       /* Build the Initial value from shift_expr.  */
9361       vec_init = gimple_build_vector_from_val (stmts,
9362                                                vectype,
9363                                                new_name);
9364       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9365                                 build_zero_cst (itype), step_expr);
9366       vec_init = gimple_build (stmts,
9367                                (induction_type == vect_step_op_shr
9368                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9369                                vectype, vec_init, vec_shift);
9370       break;
9371
9372     case vect_step_op_neg:
9373       {
9374         vec_init = gimple_build_vector_from_val (stmts,
9375                                                  vectype,
9376                                                  new_name);
9377         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9378                                      vectype, vec_init);
9379         /* The encoding has 2 interleaved stepped patterns.  */
9380         vec_perm_builder sel (nunits, 2, 3);
9381         sel.quick_grow (6);
9382         for (i = 0; i < 3; i++)
9383           {
9384             sel[2 * i] = i;
9385             sel[2 * i + 1] = i + nunits;
9386           }
9387         vec_perm_indices indices (sel, 2, nunits);
9388         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9389            fail when vec_init is const vector. In that situation vec_perm is not
9390            really needed.  */
9391         tree perm_mask_even
9392           = vect_gen_perm_mask_any (vectype, indices);
9393         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9394                                  vectype,
9395                                  vec_init, vec_neg,
9396                                  perm_mask_even);
9397       }
9398       break;
9399
9400     case vect_step_op_mul:
9401       {
9402         /* Use unsigned mult to avoid UD integer overflow.  */
9403         gcc_assert (nunits.is_constant (&const_nunits));
9404         tree utype = unsigned_type_for (itype);
9405         tree uvectype = build_vector_type (utype,
9406                                            TYPE_VECTOR_SUBPARTS (vectype));
9407         new_name = gimple_convert (stmts, utype, new_name);
9408         vec_init = gimple_build_vector_from_val (stmts,
9409                                                  uvectype,
9410                                                  new_name);
9411         tree_vector_builder elts (uvectype, const_nunits, 1);
9412         tree elt_step = build_one_cst (utype);
9413
9414         elts.quick_push (elt_step);
9415         for (i = 1; i < const_nunits; i++)
9416           {
9417             /* Create: new_name_i = new_name + step_expr.  */
9418             elt_step = gimple_build (stmts, MULT_EXPR,
9419                                      utype, elt_step, step_expr);
9420             elts.quick_push (elt_step);
9421           }
9422         /* Create a vector from [new_name_0, new_name_1, ...,
9423            new_name_nunits-1].  */
9424         tree vec_mul = gimple_build_vector (stmts, &elts);
9425         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9426                                  vec_init, vec_mul);
9427         vec_init = gimple_convert (stmts, vectype, vec_init);
9428       }
9429       break;
9430
9431     default:
9432       gcc_unreachable ();
9433     }
9434
9435   return vec_init;
9436 }
9437
9438 /* Peel init_expr by skip_niter for induction_type.  */
9439 tree
9440 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9441                              tree skip_niters, tree step_expr,
9442                              enum vect_induction_op_type induction_type)
9443 {
9444   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9445   tree type = TREE_TYPE (init_expr);
9446   unsigned prec = TYPE_PRECISION (type);
9447   switch (induction_type)
9448     {
9449     case vect_step_op_neg:
9450       if (TREE_INT_CST_LOW (skip_niters) % 2)
9451         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9452       /* else no change.  */
9453       break;
9454
9455     case vect_step_op_shr:
9456     case vect_step_op_shl:
9457       skip_niters = gimple_convert (stmts, type, skip_niters);
9458       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9459       /* When shift mount >= precision, need to avoid UD.
9460          In the original loop, there's no UD, and according to semantic,
9461          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9462       if (!tree_fits_uhwi_p (step_expr)
9463           || tree_to_uhwi (step_expr) >= prec)
9464         {
9465           if (induction_type == vect_step_op_shl
9466               || TYPE_UNSIGNED (type))
9467             init_expr = build_zero_cst (type);
9468           else
9469             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9470                                       init_expr,
9471                                       wide_int_to_tree (type, prec - 1));
9472         }
9473       else
9474         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9475                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9476                                   type, init_expr, step_expr);
9477       break;
9478
9479     case vect_step_op_mul:
9480       {
9481         tree utype = unsigned_type_for (type);
9482         init_expr = gimple_convert (stmts, utype, init_expr);
9483         wide_int skipn = wi::to_wide (skip_niters);
9484         wide_int begin = wi::to_wide (step_expr);
9485         auto_mpz base, exp, mod, res;
9486         wi::to_mpz (begin, base, TYPE_SIGN (type));
9487         wi::to_mpz (skipn, exp, UNSIGNED);
9488         mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9489         mpz_powm (res, base, exp, mod);
9490         begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9491         tree mult_expr = wide_int_to_tree (utype, begin);
9492         init_expr = gimple_build (stmts, MULT_EXPR, utype,
9493                                   init_expr, mult_expr);
9494         init_expr = gimple_convert (stmts, type, init_expr);
9495       }
9496       break;
9497
9498     default:
9499       gcc_unreachable ();
9500     }
9501
9502   return init_expr;
9503 }
9504
9505 /* Create vector step for vectorized iv.  */
9506 static tree
9507 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9508                                poly_uint64 vf,
9509                                enum vect_induction_op_type induction_type)
9510 {
9511   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9512   tree new_name = NULL;
9513   /* Step should be pow (step, vf) for mult induction.  */
9514   if (induction_type == vect_step_op_mul)
9515     {
9516       gcc_assert (vf.is_constant ());
9517       wide_int begin = wi::to_wide (step_expr);
9518
9519       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9520         begin = wi::mul (begin, wi::to_wide (step_expr));
9521
9522       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9523     }
9524   else if (induction_type == vect_step_op_neg)
9525     /* Do nothing.  */
9526     ;
9527   else
9528     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9529                              expr, step_expr);
9530   return new_name;
9531 }
9532
9533 static tree
9534 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9535                                    stmt_vec_info stmt_info,
9536                                    tree new_name, tree vectype,
9537                                    enum vect_induction_op_type induction_type)
9538 {
9539   /* No step is needed for neg induction.  */
9540   if (induction_type == vect_step_op_neg)
9541     return NULL;
9542
9543   tree t = unshare_expr (new_name);
9544   gcc_assert (CONSTANT_CLASS_P (new_name)
9545               || TREE_CODE (new_name) == SSA_NAME);
9546   tree new_vec = build_vector_from_val (vectype, t);
9547   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9548                                     new_vec, vectype, NULL);
9549   return vec_step;
9550 }
9551
9552 /* Update vectorized iv with vect_step, induc_def is init.  */
9553 static tree
9554 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9555                           tree induc_def, tree vec_step,
9556                           enum vect_induction_op_type induction_type)
9557 {
9558   tree vec_def = induc_def;
9559   switch (induction_type)
9560     {
9561     case vect_step_op_mul:
9562       {
9563         /* Use unsigned mult to avoid UD integer overflow.  */
9564         tree uvectype
9565           = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9566                                TYPE_VECTOR_SUBPARTS (vectype));
9567         vec_def = gimple_convert (stmts, uvectype, vec_def);
9568         vec_step = gimple_convert (stmts, uvectype, vec_step);
9569         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9570                                 vec_def, vec_step);
9571         vec_def = gimple_convert (stmts, vectype, vec_def);
9572       }
9573       break;
9574
9575     case vect_step_op_shr:
9576       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9577                               vec_def, vec_step);
9578       break;
9579
9580     case vect_step_op_shl:
9581       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9582                               vec_def, vec_step);
9583       break;
9584     case vect_step_op_neg:
9585       vec_def = induc_def;
9586       /* Do nothing.  */
9587       break;
9588     default:
9589       gcc_unreachable ();
9590     }
9591
9592   return vec_def;
9593
9594 }
9595
9596 /* Function vectorizable_induction
9597
9598    Check if STMT_INFO performs an nonlinear induction computation that can be
9599    vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9600    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9601    basic block.
9602    Return true if STMT_INFO is vectorizable in this way.  */
9603
9604 static bool
9605 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9606                                   stmt_vec_info stmt_info,
9607                                   gimple **vec_stmt, slp_tree slp_node,
9608                                   stmt_vector_for_cost *cost_vec)
9609 {
9610   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9611   unsigned ncopies;
9612   bool nested_in_vect_loop = false;
9613   class loop *iv_loop;
9614   tree vec_def;
9615   edge pe = loop_preheader_edge (loop);
9616   basic_block new_bb;
9617   tree vec_init, vec_step;
9618   tree new_name;
9619   gimple *new_stmt;
9620   gphi *induction_phi;
9621   tree induc_def, vec_dest;
9622   tree init_expr, step_expr;
9623   tree niters_skip;
9624   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9625   unsigned i;
9626   gimple_stmt_iterator si;
9627
9628   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9629
9630   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9631   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9632   enum vect_induction_op_type induction_type
9633     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9634
9635   gcc_assert (induction_type > vect_step_op_add);
9636
9637   if (slp_node)
9638     ncopies = 1;
9639   else
9640     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9641   gcc_assert (ncopies >= 1);
9642
9643   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9644   if (nested_in_vect_loop_p (loop, stmt_info))
9645     {
9646       if (dump_enabled_p ())
9647         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9648                          "nonlinear induction in nested loop.\n");
9649       return false;
9650     }
9651
9652   iv_loop = loop;
9653   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9654
9655   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9656      update for each iv and a permutation to generate wanted vector iv.  */
9657   if (slp_node)
9658     {
9659       if (dump_enabled_p ())
9660         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9661                          "SLP induction not supported for nonlinear"
9662                          " induction.\n");
9663       return false;
9664     }
9665
9666   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9667     {
9668       if (dump_enabled_p ())
9669         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670                          "floating point nonlinear induction vectorization"
9671                          " not supported.\n");
9672       return false;
9673     }
9674
9675   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9676   init_expr = vect_phi_initial_value (phi);
9677   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9678               && TREE_CODE (step_expr) == INTEGER_CST);
9679   /* step_expr should be aligned with init_expr,
9680      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
9681   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9682
9683   if (TREE_CODE (init_expr) == INTEGER_CST)
9684     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9685   else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9686     {
9687       /* INIT_EXPR could be a bit_field, bail out for such case.  */
9688       if (dump_enabled_p ())
9689         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9690                          "nonlinear induction vectorization failed:"
9691                          " component type of vectype is not a nop conversion"
9692                          " from type of init_expr.\n");
9693       return false;
9694     }
9695
9696   switch (induction_type)
9697     {
9698     case vect_step_op_neg:
9699       if (TREE_CODE (init_expr) != INTEGER_CST
9700           && TREE_CODE (init_expr) != REAL_CST)
9701         {
9702           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9703           if (!directly_supported_p (NEGATE_EXPR, vectype))
9704             return false;
9705
9706           /* The encoding has 2 interleaved stepped patterns.  */
9707           vec_perm_builder sel (nunits, 2, 3);
9708           machine_mode mode = TYPE_MODE (vectype);
9709           sel.quick_grow (6);
9710           for (i = 0; i < 3; i++)
9711             {
9712               sel[i * 2] = i;
9713               sel[i * 2 + 1] = i + nunits;
9714             }
9715           vec_perm_indices indices (sel, 2, nunits);
9716           if (!can_vec_perm_const_p (mode, mode, indices))
9717             return false;
9718         }
9719       break;
9720
9721     case vect_step_op_mul:
9722       {
9723         /* Check for backend support of MULT_EXPR.  */
9724         if (!directly_supported_p (MULT_EXPR, vectype))
9725           return false;
9726
9727         /* ?? How to construct vector step for variable number vector.
9728            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9729         if (!vf.is_constant ())
9730           return false;
9731       }
9732       break;
9733
9734     case vect_step_op_shr:
9735       /* Check for backend support of RSHIFT_EXPR.  */
9736       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9737         return false;
9738
9739       /* Don't shift more than type precision to avoid UD.  */
9740       if (!tree_fits_uhwi_p (step_expr)
9741           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9742                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9743         return false;
9744       break;
9745
9746     case vect_step_op_shl:
9747       /* Check for backend support of RSHIFT_EXPR.  */
9748       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9749         return false;
9750
9751       /* Don't shift more than type precision to avoid UD.  */
9752       if (!tree_fits_uhwi_p (step_expr)
9753           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9754                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9755         return false;
9756
9757       break;
9758
9759     default:
9760       gcc_unreachable ();
9761     }
9762
9763   if (!vec_stmt) /* transformation not required.  */
9764     {
9765       unsigned inside_cost = 0, prologue_cost = 0;
9766       /* loop cost for vec_loop. Neg induction doesn't have any
9767          inside_cost.  */
9768       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9769                                       stmt_info, 0, vect_body);
9770
9771       /* loop cost for vec_loop. Neg induction doesn't have any
9772          inside_cost.  */
9773       if (induction_type == vect_step_op_neg)
9774         inside_cost = 0;
9775
9776       /* prologue cost for vec_init and vec_step.  */
9777       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9778                                         stmt_info, 0, vect_prologue);
9779
9780       if (dump_enabled_p ())
9781         dump_printf_loc (MSG_NOTE, vect_location,
9782                          "vect_model_induction_cost: inside_cost = %d, "
9783                          "prologue_cost = %d. \n", inside_cost,
9784                          prologue_cost);
9785
9786       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9787       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9788       return true;
9789     }
9790
9791   /* Transform.  */
9792
9793   /* Compute a vector variable, initialized with the first VF values of
9794      the induction variable.  E.g., for an iv with IV_PHI='X' and
9795      evolution S, for a vector of 4 units, we want to compute:
9796      [X, X + S, X + 2*S, X + 3*S].  */
9797
9798   if (dump_enabled_p ())
9799     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9800
9801   pe = loop_preheader_edge (iv_loop);
9802   /* Find the first insertion point in the BB.  */
9803   basic_block bb = gimple_bb (phi);
9804   si = gsi_after_labels (bb);
9805
9806   gimple_seq stmts = NULL;
9807
9808   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9809   /* If we are using the loop mask to "peel" for alignment then we need
9810      to adjust the start value here.  */
9811   if (niters_skip != NULL_TREE)
9812     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9813                                              step_expr, induction_type);
9814
9815   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9816                                             step_expr, nunits, vectype,
9817                                             induction_type);
9818   if (stmts)
9819     {
9820       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9821       gcc_assert (!new_bb);
9822     }
9823
9824   stmts = NULL;
9825   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9826                                             vf, induction_type);
9827   if (stmts)
9828     {
9829       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9830       gcc_assert (!new_bb);
9831     }
9832
9833   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9834                                                 new_name, vectype,
9835                                                 induction_type);
9836   /* Create the following def-use cycle:
9837      loop prolog:
9838      vec_init = ...
9839      vec_step = ...
9840      loop:
9841      vec_iv = PHI <vec_init, vec_loop>
9842      ...
9843      STMT
9844      ...
9845      vec_loop = vec_iv + vec_step;  */
9846
9847   /* Create the induction-phi that defines the induction-operand.  */
9848   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9849   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9850   induc_def = PHI_RESULT (induction_phi);
9851
9852   /* Create the iv update inside the loop.  */
9853   stmts = NULL;
9854   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9855                                       induc_def, vec_step,
9856                                       induction_type);
9857
9858   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9859   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9860
9861   /* Set the arguments of the phi node:  */
9862   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9863   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9864                UNKNOWN_LOCATION);
9865
9866   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9867   *vec_stmt = induction_phi;
9868
9869   /* In case that vectorization factor (VF) is bigger than the number
9870      of elements that we can fit in a vectype (nunits), we have to generate
9871      more than one vector stmt - i.e - we need to "unroll" the
9872      vector stmt by a factor VF/nunits.  For more details see documentation
9873      in vectorizable_operation.  */
9874
9875   if (ncopies > 1)
9876     {
9877       stmts = NULL;
9878       /* FORNOW. This restriction should be relaxed.  */
9879       gcc_assert (!nested_in_vect_loop);
9880
9881       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9882                                                 nunits, induction_type);
9883
9884       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9885                                                     new_name, vectype,
9886                                                     induction_type);
9887       vec_def = induc_def;
9888       for (i = 1; i < ncopies; i++)
9889         {
9890           /* vec_i = vec_prev + vec_step.  */
9891           stmts = NULL;
9892           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9893                                               vec_def, vec_step,
9894                                               induction_type);
9895           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9896           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9897           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9898         }
9899     }
9900
9901   if (dump_enabled_p ())
9902     dump_printf_loc (MSG_NOTE, vect_location,
9903                      "transform induction: created def-use cycle: %G%G",
9904                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9905
9906   return true;
9907 }
9908
9909 /* Function vectorizable_induction
9910
9911    Check if STMT_INFO performs an induction computation that can be vectorized.
9912    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9913    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9914    Return true if STMT_INFO is vectorizable in this way.  */
9915
9916 bool
9917 vectorizable_induction (loop_vec_info loop_vinfo,
9918                         stmt_vec_info stmt_info,
9919                         gimple **vec_stmt, slp_tree slp_node,
9920                         stmt_vector_for_cost *cost_vec)
9921 {
9922   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9923   unsigned ncopies;
9924   bool nested_in_vect_loop = false;
9925   class loop *iv_loop;
9926   tree vec_def;
9927   edge pe = loop_preheader_edge (loop);
9928   basic_block new_bb;
9929   tree new_vec, vec_init, vec_step, t;
9930   tree new_name;
9931   gimple *new_stmt;
9932   gphi *induction_phi;
9933   tree induc_def, vec_dest;
9934   tree init_expr, step_expr;
9935   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9936   unsigned i;
9937   tree expr;
9938   gimple_stmt_iterator si;
9939   enum vect_induction_op_type induction_type
9940     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9941
9942   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9943   if (!phi)
9944     return false;
9945
9946   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9947     return false;
9948
9949   /* Make sure it was recognized as induction computation.  */
9950   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9951     return false;
9952
9953   /* Handle nonlinear induction in a separate place.  */
9954   if (induction_type != vect_step_op_add)
9955     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9956                                              vec_stmt, slp_node, cost_vec);
9957
9958   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9959   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9960
9961   if (slp_node)
9962     ncopies = 1;
9963   else
9964     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9965   gcc_assert (ncopies >= 1);
9966
9967   /* FORNOW. These restrictions should be relaxed.  */
9968   if (nested_in_vect_loop_p (loop, stmt_info))
9969     {
9970       imm_use_iterator imm_iter;
9971       use_operand_p use_p;
9972       gimple *exit_phi;
9973       edge latch_e;
9974       tree loop_arg;
9975
9976       if (ncopies > 1)
9977         {
9978           if (dump_enabled_p ())
9979             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9980                              "multiple types in nested loop.\n");
9981           return false;
9982         }
9983
9984       exit_phi = NULL;
9985       latch_e = loop_latch_edge (loop->inner);
9986       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9987       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9988         {
9989           gimple *use_stmt = USE_STMT (use_p);
9990           if (is_gimple_debug (use_stmt))
9991             continue;
9992
9993           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9994             {
9995               exit_phi = use_stmt;
9996               break;
9997             }
9998         }
9999       if (exit_phi)
10000         {
10001           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10002           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10003                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10004             {
10005               if (dump_enabled_p ())
10006                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10007                                  "inner-loop induction only used outside "
10008                                  "of the outer vectorized loop.\n");
10009               return false;
10010             }
10011         }
10012
10013       nested_in_vect_loop = true;
10014       iv_loop = loop->inner;
10015     }
10016   else
10017     iv_loop = loop;
10018   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10019
10020   if (slp_node && !nunits.is_constant ())
10021     {
10022       /* The current SLP code creates the step value element-by-element.  */
10023       if (dump_enabled_p ())
10024         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10025                          "SLP induction not supported for variable-length"
10026                          " vectors.\n");
10027       return false;
10028     }
10029
10030   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10031     {
10032       if (dump_enabled_p ())
10033         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10034                          "floating point induction vectorization disabled\n");
10035       return false;
10036     }
10037
10038   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10039   gcc_assert (step_expr != NULL_TREE);
10040   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10041
10042   /* Check for backend support of PLUS/MINUS_EXPR. */
10043   if (!directly_supported_p (PLUS_EXPR, step_vectype)
10044       || !directly_supported_p (MINUS_EXPR, step_vectype))
10045     return false;
10046
10047   if (!vec_stmt) /* transformation not required.  */
10048     {
10049       unsigned inside_cost = 0, prologue_cost = 0;
10050       if (slp_node)
10051         {
10052           /* We eventually need to set a vector type on invariant
10053              arguments.  */
10054           unsigned j;
10055           slp_tree child;
10056           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10057             if (!vect_maybe_update_slp_op_vectype
10058                 (child, SLP_TREE_VECTYPE (slp_node)))
10059               {
10060                 if (dump_enabled_p ())
10061                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10062                                    "incompatible vector types for "
10063                                    "invariants\n");
10064                 return false;
10065               }
10066           /* loop cost for vec_loop.  */
10067           inside_cost
10068             = record_stmt_cost (cost_vec,
10069                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10070                                 vector_stmt, stmt_info, 0, vect_body);
10071           /* prologue cost for vec_init (if not nested) and step.  */
10072           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10073                                             scalar_to_vec,
10074                                             stmt_info, 0, vect_prologue);
10075         }
10076       else /* if (!slp_node) */
10077         {
10078           /* loop cost for vec_loop.  */
10079           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10080                                           stmt_info, 0, vect_body);
10081           /* prologue cost for vec_init and vec_step.  */
10082           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10083                                             stmt_info, 0, vect_prologue);
10084         }
10085       if (dump_enabled_p ())
10086         dump_printf_loc (MSG_NOTE, vect_location,
10087                          "vect_model_induction_cost: inside_cost = %d, "
10088                          "prologue_cost = %d .\n", inside_cost,
10089                          prologue_cost);
10090
10091       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10092       DUMP_VECT_SCOPE ("vectorizable_induction");
10093       return true;
10094     }
10095
10096   /* Transform.  */
10097
10098   /* Compute a vector variable, initialized with the first VF values of
10099      the induction variable.  E.g., for an iv with IV_PHI='X' and
10100      evolution S, for a vector of 4 units, we want to compute:
10101      [X, X + S, X + 2*S, X + 3*S].  */
10102
10103   if (dump_enabled_p ())
10104     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10105
10106   pe = loop_preheader_edge (iv_loop);
10107   /* Find the first insertion point in the BB.  */
10108   basic_block bb = gimple_bb (phi);
10109   si = gsi_after_labels (bb);
10110
10111   /* For SLP induction we have to generate several IVs as for example
10112      with group size 3 we need
10113        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10114        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
10115   if (slp_node)
10116     {
10117       /* Enforced above.  */
10118       unsigned int const_nunits = nunits.to_constant ();
10119
10120       /* The initial values are vectorized, but any lanes > group_size
10121          need adjustment.  */
10122       slp_tree init_node
10123         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10124
10125       /* Gather steps.  Since we do not vectorize inductions as
10126          cycles we have to reconstruct the step from SCEV data.  */
10127       unsigned group_size = SLP_TREE_LANES (slp_node);
10128       tree *steps = XALLOCAVEC (tree, group_size);
10129       tree *inits = XALLOCAVEC (tree, group_size);
10130       stmt_vec_info phi_info;
10131       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10132         {
10133           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10134           if (!init_node)
10135             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10136                                            pe->dest_idx);
10137         }
10138
10139       /* Now generate the IVs.  */
10140       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10141       gcc_assert ((const_nunits * nvects) % group_size == 0);
10142       unsigned nivs;
10143       if (nested_in_vect_loop)
10144         nivs = nvects;
10145       else
10146         {
10147           /* Compute the number of distinct IVs we need.  First reduce
10148              group_size if it is a multiple of const_nunits so we get
10149              one IV for a group_size of 4 but const_nunits 2.  */
10150           unsigned group_sizep = group_size;
10151           if (group_sizep % const_nunits == 0)
10152             group_sizep = group_sizep / const_nunits;
10153           nivs = least_common_multiple (group_sizep,
10154                                         const_nunits) / const_nunits;
10155         }
10156       tree stept = TREE_TYPE (step_vectype);
10157       tree lupdate_mul = NULL_TREE;
10158       if (!nested_in_vect_loop)
10159         {
10160           /* The number of iterations covered in one vector iteration.  */
10161           unsigned lup_mul = (nvects * const_nunits) / group_size;
10162           lupdate_mul
10163             = build_vector_from_val (step_vectype,
10164                                      SCALAR_FLOAT_TYPE_P (stept)
10165                                      ? build_real_from_wide (stept, lup_mul,
10166                                                              UNSIGNED)
10167                                      : build_int_cstu (stept, lup_mul));
10168         }
10169       tree peel_mul = NULL_TREE;
10170       gimple_seq init_stmts = NULL;
10171       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10172         {
10173           if (SCALAR_FLOAT_TYPE_P (stept))
10174             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10175                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10176           else
10177             peel_mul = gimple_convert (&init_stmts, stept,
10178                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10179           peel_mul = gimple_build_vector_from_val (&init_stmts,
10180                                                    step_vectype, peel_mul);
10181         }
10182       unsigned ivn;
10183       auto_vec<tree> vec_steps;
10184       for (ivn = 0; ivn < nivs; ++ivn)
10185         {
10186           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10187           tree_vector_builder init_elts (vectype, const_nunits, 1);
10188           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10189           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10190             {
10191               /* The scalar steps of the IVs.  */
10192               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10193               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10194               step_elts.quick_push (elt);
10195               if (!init_node)
10196                 {
10197                   /* The scalar inits of the IVs if not vectorized.  */
10198                   elt = inits[(ivn*const_nunits + eltn) % group_size];
10199                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
10200                                                   TREE_TYPE (elt)))
10201                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10202                                         TREE_TYPE (vectype), elt);
10203                   init_elts.quick_push (elt);
10204                 }
10205               /* The number of steps to add to the initial values.  */
10206               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10207               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10208                                    ? build_real_from_wide (stept,
10209                                                            mul_elt, UNSIGNED)
10210                                    : build_int_cstu (stept, mul_elt));
10211             }
10212           vec_step = gimple_build_vector (&init_stmts, &step_elts);
10213           vec_steps.safe_push (vec_step);
10214           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10215           if (peel_mul)
10216             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10217                                      step_mul, peel_mul);
10218           if (!init_node)
10219             vec_init = gimple_build_vector (&init_stmts, &init_elts);
10220
10221           /* Create the induction-phi that defines the induction-operand.  */
10222           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10223                                             "vec_iv_");
10224           induction_phi = create_phi_node (vec_dest, iv_loop->header);
10225           induc_def = PHI_RESULT (induction_phi);
10226
10227           /* Create the iv update inside the loop  */
10228           tree up = vec_step;
10229           if (lupdate_mul)
10230             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10231                                vec_step, lupdate_mul);
10232           gimple_seq stmts = NULL;
10233           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10234           vec_def = gimple_build (&stmts,
10235                                   PLUS_EXPR, step_vectype, vec_def, up);
10236           vec_def = gimple_convert (&stmts, vectype, vec_def);
10237           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10238           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10239                        UNKNOWN_LOCATION);
10240
10241           if (init_node)
10242             vec_init = vect_get_slp_vect_def (init_node, ivn);
10243           if (!nested_in_vect_loop
10244               && !integer_zerop (step_mul))
10245             {
10246               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10247               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10248                                  vec_step, step_mul);
10249               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10250                                       vec_def, up);
10251               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10252             }
10253
10254           /* Set the arguments of the phi node:  */
10255           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10256
10257           slp_node->push_vec_def (induction_phi);
10258         }
10259       if (!nested_in_vect_loop)
10260         {
10261           /* Fill up to the number of vectors we need for the whole group.  */
10262           nivs = least_common_multiple (group_size,
10263                                         const_nunits) / const_nunits;
10264           vec_steps.reserve (nivs-ivn);
10265           for (; ivn < nivs; ++ivn)
10266             {
10267               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10268               vec_steps.quick_push (vec_steps[0]);
10269             }
10270         }
10271
10272       /* Re-use IVs when we can.  We are generating further vector
10273          stmts by adding VF' * stride to the IVs generated above.  */
10274       if (ivn < nvects)
10275         {
10276           unsigned vfp
10277             = least_common_multiple (group_size, const_nunits) / group_size;
10278           tree lupdate_mul
10279             = build_vector_from_val (step_vectype,
10280                                      SCALAR_FLOAT_TYPE_P (stept)
10281                                      ? build_real_from_wide (stept,
10282                                                              vfp, UNSIGNED)
10283                                      : build_int_cstu (stept, vfp));
10284           for (; ivn < nvects; ++ivn)
10285             {
10286               gimple *iv
10287                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10288               tree def = gimple_get_lhs (iv);
10289               if (ivn < 2*nivs)
10290                 vec_steps[ivn - nivs]
10291                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10292                                   vec_steps[ivn - nivs], lupdate_mul);
10293               gimple_seq stmts = NULL;
10294               def = gimple_convert (&stmts, step_vectype, def);
10295               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10296                                   def, vec_steps[ivn % nivs]);
10297               def = gimple_convert (&stmts, vectype, def);
10298               if (gimple_code (iv) == GIMPLE_PHI)
10299                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10300               else
10301                 {
10302                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10303                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10304                 }
10305               slp_node->push_vec_def (def);
10306             }
10307         }
10308
10309       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10310       gcc_assert (!new_bb);
10311
10312       return true;
10313     }
10314
10315   init_expr = vect_phi_initial_value (phi);
10316
10317   gimple_seq stmts = NULL;
10318   if (!nested_in_vect_loop)
10319     {
10320       /* Convert the initial value to the IV update type.  */
10321       tree new_type = TREE_TYPE (step_expr);
10322       init_expr = gimple_convert (&stmts, new_type, init_expr);
10323
10324       /* If we are using the loop mask to "peel" for alignment then we need
10325          to adjust the start value here.  */
10326       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10327       if (skip_niters != NULL_TREE)
10328         {
10329           if (FLOAT_TYPE_P (vectype))
10330             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10331                                         skip_niters);
10332           else
10333             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10334           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10335                                          skip_niters, step_expr);
10336           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10337                                     init_expr, skip_step);
10338         }
10339     }
10340
10341   if (stmts)
10342     {
10343       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10344       gcc_assert (!new_bb);
10345     }
10346
10347   /* Create the vector that holds the initial_value of the induction.  */
10348   if (nested_in_vect_loop)
10349     {
10350       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
10351          been created during vectorization of previous stmts.  We obtain it
10352          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
10353       auto_vec<tree> vec_inits;
10354       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10355                                      init_expr, &vec_inits);
10356       vec_init = vec_inits[0];
10357       /* If the initial value is not of proper type, convert it.  */
10358       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10359         {
10360           new_stmt
10361             = gimple_build_assign (vect_get_new_ssa_name (vectype,
10362                                                           vect_simple_var,
10363                                                           "vec_iv_"),
10364                                    VIEW_CONVERT_EXPR,
10365                                    build1 (VIEW_CONVERT_EXPR, vectype,
10366                                            vec_init));
10367           vec_init = gimple_assign_lhs (new_stmt);
10368           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10369                                                  new_stmt);
10370           gcc_assert (!new_bb);
10371         }
10372     }
10373   else
10374     {
10375       /* iv_loop is the loop to be vectorized. Create:
10376          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10377       stmts = NULL;
10378       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10379
10380       unsigned HOST_WIDE_INT const_nunits;
10381       if (nunits.is_constant (&const_nunits))
10382         {
10383           tree_vector_builder elts (step_vectype, const_nunits, 1);
10384           elts.quick_push (new_name);
10385           for (i = 1; i < const_nunits; i++)
10386             {
10387               /* Create: new_name_i = new_name + step_expr  */
10388               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10389                                        new_name, step_expr);
10390               elts.quick_push (new_name);
10391             }
10392           /* Create a vector from [new_name_0, new_name_1, ...,
10393              new_name_nunits-1]  */
10394           vec_init = gimple_build_vector (&stmts, &elts);
10395         }
10396       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10397         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10398         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10399                                  new_name, step_expr);
10400       else
10401         {
10402           /* Build:
10403                 [base, base, base, ...]
10404                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10405           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10406           gcc_assert (flag_associative_math);
10407           tree index = build_index_vector (step_vectype, 0, 1);
10408           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10409                                                         new_name);
10410           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10411                                                         step_expr);
10412           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10413           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10414                                    vec_init, step_vec);
10415           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10416                                    vec_init, base_vec);
10417         }
10418       vec_init = gimple_convert (&stmts, vectype, vec_init);
10419
10420       if (stmts)
10421         {
10422           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10423           gcc_assert (!new_bb);
10424         }
10425     }
10426
10427
10428   /* Create the vector that holds the step of the induction.  */
10429   gimple_stmt_iterator *step_iv_si = NULL;
10430   if (nested_in_vect_loop)
10431     /* iv_loop is nested in the loop to be vectorized. Generate:
10432        vec_step = [S, S, S, S]  */
10433     new_name = step_expr;
10434   else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10435     {
10436       /* When we're using loop_len produced by SELEC_VL, the non-final
10437          iterations are not always processing VF elements.  So vectorize
10438          induction variable instead of
10439
10440            _21 = vect_vec_iv_.6_22 + { VF, ... };
10441
10442          We should generate:
10443
10444            _35 = .SELECT_VL (ivtmp_33, VF);
10445            vect_cst__22 = [vec_duplicate_expr] _35;
10446            _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
10447       gcc_assert (!slp_node);
10448       gimple_seq seq = NULL;
10449       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10450       tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10451       expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10452                                                  unshare_expr (len)),
10453                                    &seq, true, NULL_TREE);
10454       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10455                                step_expr);
10456       gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10457       step_iv_si = &si;
10458     }
10459   else
10460     {
10461       /* iv_loop is the loop to be vectorized. Generate:
10462           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10463       gimple_seq seq = NULL;
10464       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10465         {
10466           expr = build_int_cst (integer_type_node, vf);
10467           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10468         }
10469       else
10470         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10471       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10472                                expr, step_expr);
10473       if (seq)
10474         {
10475           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10476           gcc_assert (!new_bb);
10477         }
10478     }
10479
10480   t = unshare_expr (new_name);
10481   gcc_assert (CONSTANT_CLASS_P (new_name)
10482               || TREE_CODE (new_name) == SSA_NAME);
10483   new_vec = build_vector_from_val (step_vectype, t);
10484   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10485                                new_vec, step_vectype, step_iv_si);
10486
10487
10488   /* Create the following def-use cycle:
10489      loop prolog:
10490          vec_init = ...
10491          vec_step = ...
10492      loop:
10493          vec_iv = PHI <vec_init, vec_loop>
10494          ...
10495          STMT
10496          ...
10497          vec_loop = vec_iv + vec_step;  */
10498
10499   /* Create the induction-phi that defines the induction-operand.  */
10500   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10501   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10502   induc_def = PHI_RESULT (induction_phi);
10503
10504   /* Create the iv update inside the loop  */
10505   stmts = NULL;
10506   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10507   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10508   vec_def = gimple_convert (&stmts, vectype, vec_def);
10509   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10510   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10511
10512   /* Set the arguments of the phi node:  */
10513   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10514   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10515                UNKNOWN_LOCATION);
10516
10517   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10518   *vec_stmt = induction_phi;
10519
10520   /* In case that vectorization factor (VF) is bigger than the number
10521      of elements that we can fit in a vectype (nunits), we have to generate
10522      more than one vector stmt - i.e - we need to "unroll" the
10523      vector stmt by a factor VF/nunits.  For more details see documentation
10524      in vectorizable_operation.  */
10525
10526   if (ncopies > 1)
10527     {
10528       gimple_seq seq = NULL;
10529       /* FORNOW. This restriction should be relaxed.  */
10530       gcc_assert (!nested_in_vect_loop);
10531       /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1.  */
10532       gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10533
10534       /* Create the vector that holds the step of the induction.  */
10535       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10536         {
10537           expr = build_int_cst (integer_type_node, nunits);
10538           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10539         }
10540       else
10541         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10542       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10543                                expr, step_expr);
10544       if (seq)
10545         {
10546           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10547           gcc_assert (!new_bb);
10548         }
10549
10550       t = unshare_expr (new_name);
10551       gcc_assert (CONSTANT_CLASS_P (new_name)
10552                   || TREE_CODE (new_name) == SSA_NAME);
10553       new_vec = build_vector_from_val (step_vectype, t);
10554       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10555                                    new_vec, step_vectype, NULL);
10556
10557       vec_def = induc_def;
10558       for (i = 1; i < ncopies + 1; i++)
10559         {
10560           /* vec_i = vec_prev + vec_step  */
10561           gimple_seq stmts = NULL;
10562           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10563           vec_def = gimple_build (&stmts,
10564                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10565           vec_def = gimple_convert (&stmts, vectype, vec_def);
10566
10567           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10568           if (i < ncopies)
10569             {
10570               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10571               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10572             }
10573           else
10574             {
10575               /* vec_1 = vec_iv + (VF/n * S)
10576                  vec_2 = vec_1 + (VF/n * S)
10577                  ...
10578                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10579
10580                  vec_n is used as vec_loop to save the large step register and
10581                  related operations.  */
10582               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10583                            UNKNOWN_LOCATION);
10584             }
10585         }
10586     }
10587
10588   if (dump_enabled_p ())
10589     dump_printf_loc (MSG_NOTE, vect_location,
10590                      "transform induction: created def-use cycle: %G%G",
10591                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10592
10593   return true;
10594 }
10595
10596 /* Function vectorizable_live_operation_1.
10597
10598    helper function for vectorizable_live_operation.  */
10599
10600 static tree
10601 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10602                                stmt_vec_info stmt_info, basic_block exit_bb,
10603                                tree vectype, int ncopies, slp_tree slp_node,
10604                                tree bitsize, tree bitstart, tree vec_lhs,
10605                                tree lhs_type, gimple_stmt_iterator *exit_gsi)
10606 {
10607   gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10608
10609   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10610   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10611   for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10612     SET_PHI_ARG_DEF (phi, i, vec_lhs);
10613
10614   gimple_seq stmts = NULL;
10615   tree new_tree;
10616
10617   /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
10618   if (integer_zerop (bitstart))
10619     {
10620       tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10621                                       vec_lhs_phi, bitsize, bitstart);
10622
10623       /* Convert the extracted vector element to the scalar type.  */
10624       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10625     }
10626   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10627     {
10628       /* Emit:
10629
10630          SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10631
10632          where VEC_LHS is the vectorized live-out result and MASK is
10633          the loop mask for the final iteration.  */
10634       gcc_assert (ncopies == 1 && !slp_node);
10635       gimple_seq tem = NULL;
10636       gimple_stmt_iterator gsi = gsi_last (tem);
10637       tree len = vect_get_loop_len (loop_vinfo, &gsi,
10638                                     &LOOP_VINFO_LENS (loop_vinfo),
10639                                     1, vectype, 0, 0);
10640
10641       /* BIAS - 1.  */
10642       signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10643       tree bias_minus_one
10644         = int_const_binop (MINUS_EXPR,
10645                            build_int_cst (TREE_TYPE (len), biasval),
10646                            build_one_cst (TREE_TYPE (len)));
10647
10648       /* LAST_INDEX = LEN + (BIAS - 1).  */
10649       tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10650                                      len, bias_minus_one);
10651
10652       /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
10653       tree scalar_res
10654         = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10655                         vec_lhs_phi, last_index);
10656
10657       /* Convert the extracted vector element to the scalar type.  */
10658       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10659     }
10660   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10661     {
10662       /* Emit:
10663
10664          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10665
10666          where VEC_LHS is the vectorized live-out result and MASK is
10667          the loop mask for the final iteration.  */
10668       gcc_assert (!slp_node);
10669       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10670       gimple_seq tem = NULL;
10671       gimple_stmt_iterator gsi = gsi_last (tem);
10672       tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10673                                       &LOOP_VINFO_MASKS (loop_vinfo),
10674                                       1, vectype, 0);
10675       tree scalar_res;
10676       gimple_seq_add_seq (&stmts, tem);
10677
10678       scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10679                                  mask, vec_lhs_phi);
10680
10681       /* Convert the extracted vector element to the scalar type.  */
10682       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10683     }
10684   else
10685     {
10686       tree bftype = TREE_TYPE (vectype);
10687       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10688         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10689       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10690       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10691                                        &stmts, true, NULL_TREE);
10692     }
10693
10694   *exit_gsi = gsi_after_labels (exit_bb);
10695   if (stmts)
10696     gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10697
10698   return new_tree;
10699 }
10700
10701 /* Find the edge that's the final one in the path from SRC to DEST and
10702    return it.  This edge must exist in at most one forwarder edge between.  */
10703
10704 static edge
10705 find_connected_edge (edge src, basic_block dest)
10706 {
10707    if (src->dest == dest)
10708      return src;
10709
10710   return find_edge (src->dest, dest);
10711 }
10712
10713 /* Function vectorizable_live_operation.
10714
10715    STMT_INFO computes a value that is used outside the loop.  Check if
10716    it can be supported.  */
10717
10718 bool
10719 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10720                              slp_tree slp_node, slp_instance slp_node_instance,
10721                              int slp_index, bool vec_stmt_p,
10722                              stmt_vector_for_cost *cost_vec)
10723 {
10724   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10725   imm_use_iterator imm_iter;
10726   tree lhs, lhs_type, bitsize;
10727   tree vectype = (slp_node
10728                   ? SLP_TREE_VECTYPE (slp_node)
10729                   : STMT_VINFO_VECTYPE (stmt_info));
10730   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10731   int ncopies;
10732   gimple *use_stmt;
10733   use_operand_p use_p;
10734   auto_vec<tree> vec_oprnds;
10735   int vec_entry = 0;
10736   poly_uint64 vec_index = 0;
10737
10738   gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10739               || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10740
10741   /* If a stmt of a reduction is live, vectorize it via
10742      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10743      validity so just trigger the transform here.  */
10744   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10745     {
10746       if (!vec_stmt_p)
10747         return true;
10748       if (slp_node)
10749         {
10750           /* For reduction chains the meta-info is attached to
10751              the group leader.  */
10752           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10753             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10754           /* For SLP reductions we vectorize the epilogue for
10755              all involved stmts together.  */
10756           else if (slp_index != 0)
10757             return true;
10758         }
10759       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10760       gcc_assert (reduc_info->is_reduc_info);
10761       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10762           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10763         return true;
10764
10765       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10766                                         slp_node_instance,
10767                                         LOOP_VINFO_IV_EXIT (loop_vinfo));
10768
10769       /* If early break we only have to materialize the reduction on the merge
10770          block, but we have to find an alternate exit first.  */
10771       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10772         {
10773           for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10774             if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10775               {
10776                 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10777                                                   slp_node, slp_node_instance,
10778                                                   exit);
10779                 break;
10780               }
10781         }
10782
10783       return true;
10784     }
10785
10786   /* If STMT is not relevant and it is a simple assignment and its inputs are
10787      invariant then it can remain in place, unvectorized.  The original last
10788      scalar value that it computes will be used.  */
10789   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10790     {
10791       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10792       if (dump_enabled_p ())
10793         dump_printf_loc (MSG_NOTE, vect_location,
10794                          "statement is simple and uses invariant.  Leaving in "
10795                          "place.\n");
10796       return true;
10797     }
10798
10799   if (slp_node)
10800     ncopies = 1;
10801   else
10802     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10803
10804   if (slp_node)
10805     {
10806       gcc_assert (slp_index >= 0);
10807
10808       /* Get the last occurrence of the scalar index from the concatenation of
10809          all the slp vectors. Calculate which slp vector it is and the index
10810          within.  */
10811       int num_scalar = SLP_TREE_LANES (slp_node);
10812       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10813       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10814
10815       /* Calculate which vector contains the result, and which lane of
10816          that vector we need.  */
10817       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10818         {
10819           if (dump_enabled_p ())
10820             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10821                              "Cannot determine which vector holds the"
10822                              " final result.\n");
10823           return false;
10824         }
10825     }
10826
10827   if (!vec_stmt_p)
10828     {
10829       /* No transformation required.  */
10830       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10831         {
10832           if (slp_node)
10833             {
10834               if (dump_enabled_p ())
10835                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10836                                  "can't operate on partial vectors "
10837                                  "because an SLP statement is live after "
10838                                  "the loop.\n");
10839               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10840             }
10841           else if (ncopies > 1)
10842             {
10843               if (dump_enabled_p ())
10844                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10845                                  "can't operate on partial vectors "
10846                                  "because ncopies is greater than 1.\n");
10847               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10848             }
10849           else
10850             {
10851               gcc_assert (ncopies == 1 && !slp_node);
10852               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10853                                                   OPTIMIZE_FOR_SPEED))
10854                 vect_record_loop_mask (loop_vinfo,
10855                                        &LOOP_VINFO_MASKS (loop_vinfo),
10856                                        1, vectype, NULL);
10857               else if (can_vec_extract_var_idx_p (
10858                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10859                 vect_record_loop_len (loop_vinfo,
10860                                       &LOOP_VINFO_LENS (loop_vinfo),
10861                                       1, vectype, 1);
10862               else
10863                 {
10864                   if (dump_enabled_p ())
10865                     dump_printf_loc (
10866                       MSG_MISSED_OPTIMIZATION, vect_location,
10867                       "can't operate on partial vectors "
10868                       "because the target doesn't support extract "
10869                       "last reduction.\n");
10870                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10871                 }
10872             }
10873         }
10874       /* ???  Enable for loop costing as well.  */
10875       if (!loop_vinfo)
10876         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10877                           0, vect_epilogue);
10878       return true;
10879     }
10880
10881   /* Use the lhs of the original scalar statement.  */
10882   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10883   if (dump_enabled_p ())
10884     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10885                      "stmt %G", stmt);
10886
10887   lhs = gimple_get_lhs (stmt);
10888   lhs_type = TREE_TYPE (lhs);
10889
10890   bitsize = vector_element_bits_tree (vectype);
10891
10892   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10893   tree vec_lhs, vec_lhs0, bitstart;
10894   gimple *vec_stmt, *vec_stmt0;
10895   if (slp_node)
10896     {
10897       gcc_assert (!loop_vinfo
10898                   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10899                       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10900
10901       /* Get the correct slp vectorized stmt.  */
10902       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10903       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10904
10905       /* In case we need to early break vectorize also get the first stmt.  */
10906       vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10907       vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10908
10909       /* Get entry to use.  */
10910       bitstart = bitsize_int (vec_index);
10911       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10912     }
10913   else
10914     {
10915       /* For multiple copies, get the last copy.  */
10916       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10917       vec_lhs = gimple_get_lhs (vec_stmt);
10918
10919       /* In case we need to early break vectorize also get the first stmt.  */
10920       vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10921       vec_lhs0 = gimple_get_lhs (vec_stmt0);
10922
10923       /* Get the last lane in the vector.  */
10924       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10925     }
10926
10927   if (loop_vinfo)
10928     {
10929       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10930          requirement, insert one phi node for it.  It looks like:
10931            loop;
10932          BB:
10933            # lhs' = PHI <lhs>
10934          ==>
10935            loop;
10936          BB:
10937            # vec_lhs' = PHI <vec_lhs>
10938            new_tree = lane_extract <vec_lhs', ...>;
10939            lhs' = new_tree;  */
10940
10941       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10942       /* Check if we have a loop where the chosen exit is not the main exit,
10943          in these cases for an early break we restart the iteration the vector code
10944          did.  For the live values we want the value at the start of the iteration
10945          rather than at the end.  */
10946       edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10947       bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10948       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10949         if (!is_gimple_debug (use_stmt)
10950             && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10951           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10952             {
10953               edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10954                                            phi_arg_index_from_use (use_p));
10955               bool main_exit_edge = e == main_e
10956                                     || find_connected_edge (main_e, e->src);
10957
10958               /* Early exits have an merge block, we want the merge block itself
10959                  so use ->src.  For main exit the merge block is the
10960                  destination.  */
10961               basic_block dest = main_exit_edge ? main_e->dest : e->src;
10962               tree tmp_vec_lhs = vec_lhs;
10963               tree tmp_bitstart = bitstart;
10964
10965               /* For early exit where the exit is not in the BB that leads
10966                  to the latch then we're restarting the iteration in the
10967                  scalar loop.  So get the first live value.  */
10968               restart_loop = restart_loop || !main_exit_edge;
10969               if (restart_loop
10970                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10971                 {
10972                   tmp_vec_lhs = vec_lhs0;
10973                   tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10974                 }
10975
10976               gimple_stmt_iterator exit_gsi;
10977               tree new_tree
10978                 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10979                                                  dest, vectype, ncopies,
10980                                                  slp_node, bitsize,
10981                                                  tmp_bitstart, tmp_vec_lhs,
10982                                                  lhs_type, &exit_gsi);
10983
10984               if (gimple_phi_num_args (use_stmt) == 1)
10985                 {
10986                   auto gsi = gsi_for_stmt (use_stmt);
10987                   remove_phi_node (&gsi, false);
10988                   tree lhs_phi = gimple_phi_result (use_stmt);
10989                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10990                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10991                 }
10992               else
10993                 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
10994           }
10995
10996       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
10997       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10998         gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10999     }
11000   else
11001     {
11002       /* For basic-block vectorization simply insert the lane-extraction.  */
11003       tree bftype = TREE_TYPE (vectype);
11004       if (VECTOR_BOOLEAN_TYPE_P (vectype))
11005         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11006       tree new_tree = build3 (BIT_FIELD_REF, bftype,
11007                               vec_lhs, bitsize, bitstart);
11008       gimple_seq stmts = NULL;
11009       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11010                                        &stmts, true, NULL_TREE);
11011       if (TREE_CODE (new_tree) == SSA_NAME
11012           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11013         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11014       if (is_a <gphi *> (vec_stmt))
11015         {
11016           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11017           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11018         }
11019       else
11020         {
11021           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11022           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11023         }
11024
11025       /* Replace use of lhs with newly computed result.  If the use stmt is a
11026          single arg PHI, just replace all uses of PHI result.  It's necessary
11027          because lcssa PHI defining lhs may be before newly inserted stmt.  */
11028       use_operand_p use_p;
11029       stmt_vec_info use_stmt_info;
11030       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11031         if (!is_gimple_debug (use_stmt)
11032             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11033                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11034           {
11035             /* ???  This can happen when the live lane ends up being
11036                rooted in a vector construction code-generated by an
11037                external SLP node (and code-generation for that already
11038                happened).  See gcc.dg/vect/bb-slp-47.c.
11039                Doing this is what would happen if that vector CTOR
11040                were not code-generated yet so it is not too bad.
11041                ???  In fact we'd likely want to avoid this situation
11042                in the first place.  */
11043             if (TREE_CODE (new_tree) == SSA_NAME
11044                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11045                 && gimple_code (use_stmt) != GIMPLE_PHI
11046                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11047                                                 use_stmt))
11048               {
11049                 if (dump_enabled_p ())
11050                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11051                                    "Using original scalar computation for "
11052                                    "live lane because use preceeds vector "
11053                                    "def\n");
11054                 continue;
11055               }
11056             /* ???  It can also happen that we end up pulling a def into
11057                a loop where replacing out-of-loop uses would require
11058                a new LC SSA PHI node.  Retain the original scalar in
11059                those cases as well.  PR98064.  */
11060             if (TREE_CODE (new_tree) == SSA_NAME
11061                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11062                 && (gimple_bb (use_stmt)->loop_father
11063                     != gimple_bb (vec_stmt)->loop_father)
11064                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11065                                         gimple_bb (use_stmt)->loop_father))
11066               {
11067                 if (dump_enabled_p ())
11068                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11069                                    "Using original scalar computation for "
11070                                    "live lane because there is an out-of-loop "
11071                                    "definition for it\n");
11072                 continue;
11073               }
11074             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11075               SET_USE (use_p, new_tree);
11076             update_stmt (use_stmt);
11077           }
11078     }
11079
11080   return true;
11081 }
11082
11083 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
11084
11085 static void
11086 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11087 {
11088   ssa_op_iter op_iter;
11089   imm_use_iterator imm_iter;
11090   def_operand_p def_p;
11091   gimple *ustmt;
11092
11093   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11094     {
11095       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11096         {
11097           basic_block bb;
11098
11099           if (!is_gimple_debug (ustmt))
11100             continue;
11101
11102           bb = gimple_bb (ustmt);
11103
11104           if (!flow_bb_inside_loop_p (loop, bb))
11105             {
11106               if (gimple_debug_bind_p (ustmt))
11107                 {
11108                   if (dump_enabled_p ())
11109                     dump_printf_loc (MSG_NOTE, vect_location,
11110                                      "killing debug use\n");
11111
11112                   gimple_debug_bind_reset_value (ustmt);
11113                   update_stmt (ustmt);
11114                 }
11115               else
11116                 gcc_unreachable ();
11117             }
11118         }
11119     }
11120 }
11121
11122 /* Given loop represented by LOOP_VINFO, return true if computation of
11123    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11124    otherwise.  */
11125
11126 static bool
11127 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11128 {
11129   /* Constant case.  */
11130   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11131     {
11132       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11133       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11134
11135       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11136       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11137       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11138         return true;
11139     }
11140
11141   widest_int max;
11142   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11143   /* Check the upper bound of loop niters.  */
11144   if (get_max_loop_iterations (loop, &max))
11145     {
11146       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11147       signop sgn = TYPE_SIGN (type);
11148       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11149       if (max < type_max)
11150         return true;
11151     }
11152   return false;
11153 }
11154
11155 /* Return a mask type with half the number of elements as OLD_TYPE,
11156    given that it should have mode NEW_MODE.  */
11157
11158 tree
11159 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11160 {
11161   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11162   return build_truth_vector_type_for_mode (nunits, new_mode);
11163 }
11164
11165 /* Return a mask type with twice as many elements as OLD_TYPE,
11166    given that it should have mode NEW_MODE.  */
11167
11168 tree
11169 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11170 {
11171   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11172   return build_truth_vector_type_for_mode (nunits, new_mode);
11173 }
11174
11175 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11176    contain a sequence of NVECTORS masks that each control a vector of type
11177    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
11178    these vector masks with the vector version of SCALAR_MASK.  */
11179
11180 void
11181 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11182                        unsigned int nvectors, tree vectype, tree scalar_mask)
11183 {
11184   gcc_assert (nvectors != 0);
11185
11186   if (scalar_mask)
11187     {
11188       scalar_cond_masked_key cond (scalar_mask, nvectors);
11189       loop_vinfo->scalar_cond_masked_set.add (cond);
11190     }
11191
11192   masks->mask_set.add (std::make_pair (vectype, nvectors));
11193 }
11194
11195 /* Given a complete set of masks MASKS, extract mask number INDEX
11196    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11197    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
11198
11199    See the comment above vec_loop_masks for more details about the mask
11200    arrangement.  */
11201
11202 tree
11203 vect_get_loop_mask (loop_vec_info loop_vinfo,
11204                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11205                     unsigned int nvectors, tree vectype, unsigned int index)
11206 {
11207   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11208       == vect_partial_vectors_while_ult)
11209     {
11210       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11211       tree mask_type = rgm->type;
11212
11213       /* Populate the rgroup's mask array, if this is the first time we've
11214          used it.  */
11215       if (rgm->controls.is_empty ())
11216         {
11217           rgm->controls.safe_grow_cleared (nvectors, true);
11218           for (unsigned int i = 0; i < nvectors; ++i)
11219             {
11220               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11221               /* Provide a dummy definition until the real one is available.  */
11222               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11223               rgm->controls[i] = mask;
11224             }
11225         }
11226
11227       tree mask = rgm->controls[index];
11228       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11229                     TYPE_VECTOR_SUBPARTS (vectype)))
11230         {
11231           /* A loop mask for data type X can be reused for data type Y
11232              if X has N times more elements than Y and if Y's elements
11233              are N times bigger than X's.  In this case each sequence
11234              of N elements in the loop mask will be all-zero or all-one.
11235              We can then view-convert the mask so that each sequence of
11236              N elements is replaced by a single element.  */
11237           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11238                                   TYPE_VECTOR_SUBPARTS (vectype)));
11239           gimple_seq seq = NULL;
11240           mask_type = truth_type_for (vectype);
11241           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11242           if (seq)
11243             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11244         }
11245       return mask;
11246     }
11247   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11248            == vect_partial_vectors_avx512)
11249     {
11250       /* The number of scalars per iteration and the number of vectors are
11251          both compile-time constants.  */
11252       unsigned int nscalars_per_iter
11253         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11254                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11255
11256       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11257
11258       /* The stored nV is dependent on the mask type produced.  */
11259       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11260                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11261                   == rgm->factor);
11262       nvectors = rgm->factor;
11263
11264       /* Populate the rgroup's mask array, if this is the first time we've
11265          used it.  */
11266       if (rgm->controls.is_empty ())
11267         {
11268           rgm->controls.safe_grow_cleared (nvectors, true);
11269           for (unsigned int i = 0; i < nvectors; ++i)
11270             {
11271               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11272               /* Provide a dummy definition until the real one is available.  */
11273               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11274               rgm->controls[i] = mask;
11275             }
11276         }
11277       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11278                     TYPE_VECTOR_SUBPARTS (vectype)))
11279         return rgm->controls[index];
11280
11281       /* Split the vector if needed.  Since we are dealing with integer mode
11282          masks with AVX512 we can operate on the integer representation
11283          performing the whole vector shifting.  */
11284       unsigned HOST_WIDE_INT factor;
11285       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11286                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
11287       gcc_assert (ok);
11288       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11289       tree mask_type = truth_type_for (vectype);
11290       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11291       unsigned vi = index / factor;
11292       unsigned vpart = index % factor;
11293       tree vec = rgm->controls[vi];
11294       gimple_seq seq = NULL;
11295       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11296                           lang_hooks.types.type_for_mode
11297                                 (TYPE_MODE (rgm->type), 1), vec);
11298       /* For integer mode masks simply shift the right bits into position.  */
11299       if (vpart != 0)
11300         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11301                             build_int_cst (integer_type_node,
11302                                            (TYPE_VECTOR_SUBPARTS (vectype)
11303                                             * vpart)));
11304       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11305                                     (TYPE_MODE (mask_type), 1), vec);
11306       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11307       if (seq)
11308         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11309       return vec;
11310     }
11311   else
11312     gcc_unreachable ();
11313 }
11314
11315 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11316    lengths for controlling an operation on VECTYPE.  The operation splits
11317    each element of VECTYPE into FACTOR separate subelements, measuring the
11318    length as a number of these subelements.  */
11319
11320 void
11321 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11322                       unsigned int nvectors, tree vectype, unsigned int factor)
11323 {
11324   gcc_assert (nvectors != 0);
11325   if (lens->length () < nvectors)
11326     lens->safe_grow_cleared (nvectors, true);
11327   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11328
11329   /* The number of scalars per iteration, scalar occupied bytes and
11330      the number of vectors are both compile-time constants.  */
11331   unsigned int nscalars_per_iter
11332     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11333                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11334
11335   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11336     {
11337       /* For now, we only support cases in which all loads and stores fall back
11338          to VnQI or none do.  */
11339       gcc_assert (!rgl->max_nscalars_per_iter
11340                   || (rgl->factor == 1 && factor == 1)
11341                   || (rgl->max_nscalars_per_iter * rgl->factor
11342                       == nscalars_per_iter * factor));
11343       rgl->max_nscalars_per_iter = nscalars_per_iter;
11344       rgl->type = vectype;
11345       rgl->factor = factor;
11346     }
11347 }
11348
11349 /* Given a complete set of lengths LENS, extract length number INDEX
11350    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11351    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
11352    multipled by the number of elements that should be processed.
11353    Insert any set-up statements before GSI.  */
11354
11355 tree
11356 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11357                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11358                    unsigned int index, unsigned int factor)
11359 {
11360   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11361   bool use_bias_adjusted_len =
11362     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11363
11364   /* Populate the rgroup's len array, if this is the first time we've
11365      used it.  */
11366   if (rgl->controls.is_empty ())
11367     {
11368       rgl->controls.safe_grow_cleared (nvectors, true);
11369       for (unsigned int i = 0; i < nvectors; ++i)
11370         {
11371           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11372           gcc_assert (len_type != NULL_TREE);
11373
11374           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11375
11376           /* Provide a dummy definition until the real one is available.  */
11377           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11378           rgl->controls[i] = len;
11379
11380           if (use_bias_adjusted_len)
11381             {
11382               gcc_assert (i == 0);
11383               tree adjusted_len =
11384                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11385               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11386               rgl->bias_adjusted_ctrl = adjusted_len;
11387             }
11388         }
11389     }
11390
11391   if (use_bias_adjusted_len)
11392     return rgl->bias_adjusted_ctrl;
11393
11394   tree loop_len = rgl->controls[index];
11395   if (rgl->factor == 1 && factor == 1)
11396     {
11397       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11398       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11399       if (maybe_ne (nunits1, nunits2))
11400         {
11401           /* A loop len for data type X can be reused for data type Y
11402              if X has N times more elements than Y and if Y's elements
11403              are N times bigger than X's.  */
11404           gcc_assert (multiple_p (nunits1, nunits2));
11405           factor = exact_div (nunits1, nunits2).to_constant ();
11406           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11407           gimple_seq seq = NULL;
11408           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11409                                    build_int_cst (iv_type, factor));
11410           if (seq)
11411             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11412         }
11413     }
11414   return loop_len;
11415 }
11416
11417 /* Scale profiling counters by estimation for LOOP which is vectorized
11418    by factor VF.
11419    If FLAT is true, the loop we started with had unrealistically flat
11420    profile.  */
11421
11422 static void
11423 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11424 {
11425   /* For flat profiles do not scale down proportionally by VF and only
11426      cap by known iteration count bounds.  */
11427   if (flat)
11428     {
11429       if (dump_file && (dump_flags & TDF_DETAILS))
11430         fprintf (dump_file,
11431                  "Vectorized loop profile seems flat; not scaling iteration "
11432                  "count down by the vectorization factor %i\n", vf);
11433       scale_loop_profile (loop, profile_probability::always (),
11434                           get_likely_max_loop_iterations_int (loop));
11435       return;
11436     }
11437   /* Loop body executes VF fewer times and exit increases VF times.  */
11438   profile_count entry_count = loop_preheader_edge (loop)->count ();
11439
11440   /* If we have unreliable loop profile avoid dropping entry
11441      count bellow header count.  This can happen since loops
11442      has unrealistically low trip counts.  */
11443   while (vf > 1
11444          && loop->header->count > entry_count
11445          && loop->header->count < entry_count * vf)
11446     {
11447       if (dump_file && (dump_flags & TDF_DETAILS))
11448         fprintf (dump_file,
11449                  "Vectorization factor %i seems too large for profile "
11450                  "prevoiusly believed to be consistent; reducing.\n", vf);
11451       vf /= 2;
11452     }
11453
11454   if (entry_count.nonzero_p ())
11455     set_edge_probability_and_rescale_others
11456             (exit_e,
11457              entry_count.probability_in (loop->header->count / vf));
11458   /* Avoid producing very large exit probability when we do not have
11459      sensible profile.  */
11460   else if (exit_e->probability < profile_probability::always () / (vf * 2))
11461     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11462   loop->latch->count = single_pred_edge (loop->latch)->count ();
11463
11464   scale_loop_profile (loop, profile_probability::always () / vf,
11465                       get_likely_max_loop_iterations_int (loop));
11466 }
11467
11468 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11469    latch edge values originally defined by it.  */
11470
11471 static void
11472 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11473                                      stmt_vec_info def_stmt_info)
11474 {
11475   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11476   if (!def || TREE_CODE (def) != SSA_NAME)
11477     return;
11478   stmt_vec_info phi_info;
11479   imm_use_iterator iter;
11480   use_operand_p use_p;
11481   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11482     {
11483       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11484       if (!phi)
11485         continue;
11486       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11487             && (phi_info = loop_vinfo->lookup_stmt (phi))
11488             && STMT_VINFO_RELEVANT_P (phi_info)))
11489         continue;
11490       loop_p loop = gimple_bb (phi)->loop_father;
11491       edge e = loop_latch_edge (loop);
11492       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11493         continue;
11494
11495       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11496           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11497           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11498         {
11499           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11500           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11501           gcc_assert (phi_defs.length () == latch_defs.length ());
11502           for (unsigned i = 0; i < phi_defs.length (); ++i)
11503             add_phi_arg (as_a <gphi *> (phi_defs[i]),
11504                          gimple_get_lhs (latch_defs[i]), e,
11505                          gimple_phi_arg_location (phi, e->dest_idx));
11506         }
11507       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11508         {
11509           /* For first order recurrences we have to update both uses of
11510              the latch definition, the one in the PHI node and the one
11511              in the generated VEC_PERM_EXPR.  */
11512           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11513           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11514           gcc_assert (phi_defs.length () == latch_defs.length ());
11515           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11516           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11517           for (unsigned i = 0; i < phi_defs.length (); ++i)
11518             {
11519               gassign *perm = as_a <gassign *> (phi_defs[i]);
11520               if (i > 0)
11521                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11522               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11523               update_stmt (perm);
11524             }
11525           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11526                        gimple_phi_arg_location (phi, e->dest_idx));
11527         }
11528     }
11529 }
11530
11531 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11532    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11533    stmt_vec_info.  */
11534
11535 static bool
11536 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11537                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11538 {
11539   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11540   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11541
11542   if (dump_enabled_p ())
11543     dump_printf_loc (MSG_NOTE, vect_location,
11544                      "------>vectorizing statement: %G", stmt_info->stmt);
11545
11546   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11547     vect_loop_kill_debug_uses (loop, stmt_info);
11548
11549   if (!STMT_VINFO_RELEVANT_P (stmt_info)
11550       && !STMT_VINFO_LIVE_P (stmt_info))
11551     {
11552       if (is_gimple_call (stmt_info->stmt)
11553           && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11554         {
11555           gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11556           *seen_store = stmt_info;
11557           return false;
11558         }
11559       return false;
11560     }
11561
11562   if (STMT_VINFO_VECTYPE (stmt_info))
11563     {
11564       poly_uint64 nunits
11565         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11566       if (!STMT_SLP_TYPE (stmt_info)
11567           && maybe_ne (nunits, vf)
11568           && dump_enabled_p ())
11569         /* For SLP VF is set according to unrolling factor, and not
11570            to vector size, hence for SLP this print is not valid.  */
11571         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11572     }
11573
11574   /* Pure SLP statements have already been vectorized.  We still need
11575      to apply loop vectorization to hybrid SLP statements.  */
11576   if (PURE_SLP_STMT (stmt_info))
11577     return false;
11578
11579   if (dump_enabled_p ())
11580     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11581
11582   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11583     *seen_store = stmt_info;
11584
11585   return true;
11586 }
11587
11588 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11589    in the hash_map with its corresponding values.  */
11590
11591 static tree
11592 find_in_mapping (tree t, void *context)
11593 {
11594   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11595
11596   tree *value = mapping->get (t);
11597   return value ? *value : t;
11598 }
11599
11600 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11601    original loop that has now been vectorized.
11602
11603    The inits of the data_references need to be advanced with the number of
11604    iterations of the main loop.  This has been computed in vect_do_peeling and
11605    is stored in parameter ADVANCE.  We first restore the data_references
11606    initial offset with the values recored in ORIG_DRS_INIT.
11607
11608    Since the loop_vec_info of this EPILOGUE was constructed for the original
11609    loop, its stmt_vec_infos all point to the original statements.  These need
11610    to be updated to point to their corresponding copies as well as the SSA_NAMES
11611    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11612
11613    The data_reference's connections also need to be updated.  Their
11614    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11615    stmt_vec_infos, their statements need to point to their corresponding copy,
11616    if they are gather loads or scatter stores then their reference needs to be
11617    updated to point to its corresponding copy and finally we set
11618    'base_misaligned' to false as we have already peeled for alignment in the
11619    prologue of the main loop.  */
11620
11621 static void
11622 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11623 {
11624   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11625   auto_vec<gimple *> stmt_worklist;
11626   hash_map<tree,tree> mapping;
11627   gimple *orig_stmt, *new_stmt;
11628   gimple_stmt_iterator epilogue_gsi;
11629   gphi_iterator epilogue_phi_gsi;
11630   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11631   basic_block *epilogue_bbs = get_loop_body (epilogue);
11632   unsigned i;
11633
11634   free (LOOP_VINFO_BBS (epilogue_vinfo));
11635   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11636
11637   /* Advance data_reference's with the number of iterations of the previous
11638      loop and its prologue.  */
11639   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11640
11641
11642   /* The EPILOGUE loop is a copy of the original loop so they share the same
11643      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11644      point to the copied statements.  We also create a mapping of all LHS' in
11645      the original loop and all the LHS' in the EPILOGUE and create worklists to
11646      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11647   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11648     {
11649       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11650            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11651         {
11652           new_stmt = epilogue_phi_gsi.phi ();
11653
11654           gcc_assert (gimple_uid (new_stmt) > 0);
11655           stmt_vinfo
11656             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11657
11658           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11659           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11660
11661           mapping.put (gimple_phi_result (orig_stmt),
11662                        gimple_phi_result (new_stmt));
11663           /* PHI nodes can not have patterns or related statements.  */
11664           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11665                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11666         }
11667
11668       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11669            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11670         {
11671           new_stmt = gsi_stmt (epilogue_gsi);
11672           if (is_gimple_debug (new_stmt))
11673             continue;
11674
11675           gcc_assert (gimple_uid (new_stmt) > 0);
11676           stmt_vinfo
11677             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11678
11679           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11680           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11681
11682           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11683             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11684
11685           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11686             {
11687               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11688               for (gimple_stmt_iterator gsi = gsi_start (seq);
11689                    !gsi_end_p (gsi); gsi_next (&gsi))
11690                 stmt_worklist.safe_push (gsi_stmt (gsi));
11691             }
11692
11693           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11694           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11695             {
11696               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11697               stmt_worklist.safe_push (stmt);
11698               /* Set BB such that the assert in
11699                 'get_initial_def_for_reduction' is able to determine that
11700                 the BB of the related stmt is inside this loop.  */
11701               gimple_set_bb (stmt,
11702                              gimple_bb (new_stmt));
11703               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11704               gcc_assert (related_vinfo == NULL
11705                           || related_vinfo == stmt_vinfo);
11706             }
11707         }
11708     }
11709
11710   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11711      using the original main loop and thus need to be updated to refer to the
11712      cloned variables used in the epilogue.  */
11713   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11714     {
11715       gimple *stmt = stmt_worklist[i];
11716       tree *new_op;
11717
11718       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11719         {
11720           tree op = gimple_op (stmt, j);
11721           if ((new_op = mapping.get(op)))
11722             gimple_set_op (stmt, j, *new_op);
11723           else
11724             {
11725               /* PR92429: The last argument of simplify_replace_tree disables
11726                  folding when replacing arguments.  This is required as
11727                  otherwise you might end up with different statements than the
11728                  ones analyzed in vect_loop_analyze, leading to different
11729                  vectorization.  */
11730               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11731                                           &find_in_mapping, &mapping, false);
11732               gimple_set_op (stmt, j, op);
11733             }
11734         }
11735     }
11736
11737   struct data_reference *dr;
11738   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11739   FOR_EACH_VEC_ELT (datarefs, i, dr)
11740     {
11741       orig_stmt = DR_STMT (dr);
11742       gcc_assert (gimple_uid (orig_stmt) > 0);
11743       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11744       /* Data references for gather loads and scatter stores do not use the
11745          updated offset we set using ADVANCE.  Instead we have to make sure the
11746          reference in the data references point to the corresponding copy of
11747          the original in the epilogue.  Make sure to update both
11748          gather/scatters recognized by dataref analysis and also other
11749          refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
11750       auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11751       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11752           || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11753         {
11754           DR_REF (dr)
11755             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11756                                      &find_in_mapping, &mapping);
11757           DR_BASE_ADDRESS (dr)
11758             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11759                                      &find_in_mapping, &mapping);
11760         }
11761       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11762       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11763       /* The vector size of the epilogue is smaller than that of the main loop
11764          so the alignment is either the same or lower. This means the dr will
11765          thus by definition be aligned.  */
11766       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11767     }
11768
11769   epilogue_vinfo->shared->datarefs_copy.release ();
11770   epilogue_vinfo->shared->save_datarefs ();
11771 }
11772
11773 /*  When vectorizing early break statements instructions that happen before
11774     the early break in the current BB need to be moved to after the early
11775     break.  This function deals with that and assumes that any validity
11776     checks has already been performed.
11777
11778     While moving the instructions if it encounters a VUSE or VDEF it then
11779     corrects the VUSES as it moves the statements along.  GDEST is the location
11780     in which to insert the new statements.  */
11781
11782 static void
11783 move_early_exit_stmts (loop_vec_info loop_vinfo)
11784 {
11785   DUMP_VECT_SCOPE ("move_early_exit_stmts");
11786
11787   if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11788     return;
11789
11790   /* Move all stmts that need moving.  */
11791   basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11792   gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11793
11794   for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11795     {
11796       /* Check to see if statement is still required for vect or has been
11797          elided.  */
11798       auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11799       if (!stmt_info)
11800         continue;
11801
11802       if (dump_enabled_p ())
11803         dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11804
11805       gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11806       gsi_move_before (&stmt_gsi, &dest_gsi);
11807       gsi_prev (&dest_gsi);
11808     }
11809
11810   /* Update all the stmts with their new reaching VUSES.  */
11811   tree vuse
11812     = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11813   for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11814     {
11815       if (dump_enabled_p ())
11816           dump_printf_loc (MSG_NOTE, vect_location,
11817                            "updating vuse to %T for load %G", vuse, p);
11818       gimple_set_vuse (p, vuse);
11819       update_stmt (p);
11820     }
11821 }
11822
11823 /* Function vect_transform_loop.
11824
11825    The analysis phase has determined that the loop is vectorizable.
11826    Vectorize the loop - created vectorized stmts to replace the scalar
11827    stmts in the loop, and update the loop exit condition.
11828    Returns scalar epilogue loop if any.  */
11829
11830 class loop *
11831 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11832 {
11833   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11834   class loop *epilogue = NULL;
11835   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11836   int nbbs = loop->num_nodes;
11837   int i;
11838   tree niters_vector = NULL_TREE;
11839   tree step_vector = NULL_TREE;
11840   tree niters_vector_mult_vf = NULL_TREE;
11841   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11842   unsigned int lowest_vf = constant_lower_bound (vf);
11843   gimple *stmt;
11844   bool check_profitability = false;
11845   unsigned int th;
11846   bool flat = maybe_flat_loop_profile (loop);
11847
11848   DUMP_VECT_SCOPE ("vec_transform_loop");
11849
11850   loop_vinfo->shared->check_datarefs ();
11851
11852   /* Use the more conservative vectorization threshold.  If the number
11853      of iterations is constant assume the cost check has been performed
11854      by our caller.  If the threshold makes all loops profitable that
11855      run at least the (estimated) vectorization factor number of times
11856      checking is pointless, too.  */
11857   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11858   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11859     {
11860       if (dump_enabled_p ())
11861         dump_printf_loc (MSG_NOTE, vect_location,
11862                          "Profitability threshold is %d loop iterations.\n",
11863                          th);
11864       check_profitability = true;
11865     }
11866
11867   /* Make sure there exists a single-predecessor exit bb.  Do this before
11868      versioning.   */
11869   edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11870   if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11871     {
11872       split_loop_exit_edge (e, true);
11873       if (dump_enabled_p ())
11874         dump_printf (MSG_NOTE, "split exit edge\n");
11875     }
11876
11877   /* Version the loop first, if required, so the profitability check
11878      comes first.  */
11879
11880   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11881     {
11882       class loop *sloop
11883         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11884       sloop->force_vectorize = false;
11885       check_profitability = false;
11886     }
11887
11888   /* Make sure there exists a single-predecessor exit bb also on the
11889      scalar loop copy.  Do this after versioning but before peeling
11890      so CFG structure is fine for both scalar and if-converted loop
11891      to make slpeel_duplicate_current_defs_from_edges face matched
11892      loop closed PHI nodes on the exit.  */
11893   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11894     {
11895       e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11896       if (! single_pred_p (e->dest))
11897         {
11898           split_loop_exit_edge (e, true);
11899           if (dump_enabled_p ())
11900             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11901         }
11902     }
11903
11904   tree niters = vect_build_loop_niters (loop_vinfo);
11905   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11906   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11907   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11908   tree advance;
11909   drs_init_vec orig_drs_init;
11910
11911   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11912                               &step_vector, &niters_vector_mult_vf, th,
11913                               check_profitability, niters_no_overflow,
11914                               &advance);
11915   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11916       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11917     {
11918       /* Ifcvt duplicates loop preheader, loop body and produces an basic
11919          block after loop exit.  We need to scale all that.  */
11920       basic_block preheader
11921         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11922       preheader->count
11923         = preheader->count.apply_probability
11924               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11925       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11926                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11927       single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11928         = preheader->count;
11929     }
11930
11931   if (niters_vector == NULL_TREE)
11932     {
11933       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11934           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11935           && known_eq (lowest_vf, vf))
11936         {
11937           niters_vector
11938             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11939                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11940           step_vector = build_one_cst (TREE_TYPE (niters));
11941         }
11942       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11943         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11944                                      &step_vector, niters_no_overflow);
11945       else
11946         /* vect_do_peeling subtracted the number of peeled prologue
11947            iterations from LOOP_VINFO_NITERS.  */
11948         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11949                                      &niters_vector, &step_vector,
11950                                      niters_no_overflow);
11951     }
11952
11953   /* 1) Make sure the loop header has exactly two entries
11954      2) Make sure we have a preheader basic block.  */
11955
11956   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11957
11958   split_edge (loop_preheader_edge (loop));
11959
11960   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11961     /* This will deal with any possible peeling.  */
11962     vect_prepare_for_masked_peels (loop_vinfo);
11963
11964   /* Handle any code motion that we need to for early-break vectorization after
11965      we've done peeling but just before we start vectorizing.  */
11966   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11967     move_early_exit_stmts (loop_vinfo);
11968
11969   /* Schedule the SLP instances first, then handle loop vectorization
11970      below.  */
11971   if (!loop_vinfo->slp_instances.is_empty ())
11972     {
11973       DUMP_VECT_SCOPE ("scheduling SLP instances");
11974       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11975     }
11976
11977   /* FORNOW: the vectorizer supports only loops which body consist
11978      of one basic block (header + empty latch). When the vectorizer will
11979      support more involved loop forms, the order by which the BBs are
11980      traversed need to be reconsidered.  */
11981
11982   for (i = 0; i < nbbs; i++)
11983     {
11984       basic_block bb = bbs[i];
11985       stmt_vec_info stmt_info;
11986
11987       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11988            gsi_next (&si))
11989         {
11990           gphi *phi = si.phi ();
11991           if (dump_enabled_p ())
11992             dump_printf_loc (MSG_NOTE, vect_location,
11993                              "------>vectorizing phi: %G", (gimple *) phi);
11994           stmt_info = loop_vinfo->lookup_stmt (phi);
11995           if (!stmt_info)
11996             continue;
11997
11998           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11999             vect_loop_kill_debug_uses (loop, stmt_info);
12000
12001           if (!STMT_VINFO_RELEVANT_P (stmt_info)
12002               && !STMT_VINFO_LIVE_P (stmt_info))
12003             continue;
12004
12005           if (STMT_VINFO_VECTYPE (stmt_info)
12006               && (maybe_ne
12007                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12008               && dump_enabled_p ())
12009             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12010
12011           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12012                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12013                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12014                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12015                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12016                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12017               && ! PURE_SLP_STMT (stmt_info))
12018             {
12019               if (dump_enabled_p ())
12020                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12021               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12022             }
12023         }
12024
12025       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12026            gsi_next (&si))
12027         {
12028           gphi *phi = si.phi ();
12029           stmt_info = loop_vinfo->lookup_stmt (phi);
12030           if (!stmt_info)
12031             continue;
12032
12033           if (!STMT_VINFO_RELEVANT_P (stmt_info)
12034               && !STMT_VINFO_LIVE_P (stmt_info))
12035             continue;
12036
12037           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12038                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12039                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12040                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12041                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12042                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12043               && ! PURE_SLP_STMT (stmt_info))
12044             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12045         }
12046
12047       for (gimple_stmt_iterator si = gsi_start_bb (bb);
12048            !gsi_end_p (si);)
12049         {
12050           stmt = gsi_stmt (si);
12051           /* During vectorization remove existing clobber stmts.  */
12052           if (gimple_clobber_p (stmt))
12053             {
12054               unlink_stmt_vdef (stmt);
12055               gsi_remove (&si, true);
12056               release_defs (stmt);
12057             }
12058           else
12059             {
12060               /* Ignore vector stmts created in the outer loop.  */
12061               stmt_info = loop_vinfo->lookup_stmt (stmt);
12062
12063               /* vector stmts created in the outer-loop during vectorization of
12064                  stmts in an inner-loop may not have a stmt_info, and do not
12065                  need to be vectorized.  */
12066               stmt_vec_info seen_store = NULL;
12067               if (stmt_info)
12068                 {
12069                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12070                     {
12071                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12072                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12073                            !gsi_end_p (subsi); gsi_next (&subsi))
12074                         {
12075                           stmt_vec_info pat_stmt_info
12076                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12077                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12078                                                     &si, &seen_store);
12079                         }
12080                       stmt_vec_info pat_stmt_info
12081                         = STMT_VINFO_RELATED_STMT (stmt_info);
12082                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12083                                                     &si, &seen_store))
12084                         maybe_set_vectorized_backedge_value (loop_vinfo,
12085                                                              pat_stmt_info);
12086                     }
12087                   else
12088                     {
12089                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12090                                                     &seen_store))
12091                         maybe_set_vectorized_backedge_value (loop_vinfo,
12092                                                              stmt_info);
12093                     }
12094                 }
12095               gsi_next (&si);
12096               if (seen_store)
12097                 {
12098                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12099                     /* Interleaving.  If IS_STORE is TRUE, the
12100                        vectorization of the interleaving chain was
12101                        completed - free all the stores in the chain.  */
12102                     vect_remove_stores (loop_vinfo,
12103                                         DR_GROUP_FIRST_ELEMENT (seen_store));
12104                   else
12105                     /* Free the attached stmt_vec_info and remove the stmt.  */
12106                     loop_vinfo->remove_stmt (stmt_info);
12107                 }
12108             }
12109         }
12110
12111       /* Stub out scalar statements that must not survive vectorization.
12112          Doing this here helps with grouped statements, or statements that
12113          are involved in patterns.  */
12114       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12115            !gsi_end_p (gsi); gsi_next (&gsi))
12116         {
12117           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12118           if (!call || !gimple_call_internal_p (call))
12119             continue;
12120           internal_fn ifn = gimple_call_internal_fn (call);
12121           if (ifn == IFN_MASK_LOAD)
12122             {
12123               tree lhs = gimple_get_lhs (call);
12124               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12125                 {
12126                   tree zero = build_zero_cst (TREE_TYPE (lhs));
12127                   gimple *new_stmt = gimple_build_assign (lhs, zero);
12128                   gsi_replace (&gsi, new_stmt, true);
12129                 }
12130             }
12131           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12132             {
12133               tree lhs = gimple_get_lhs (call);
12134               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12135                 {
12136                   tree else_arg
12137                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12138                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12139                   gsi_replace (&gsi, new_stmt, true);
12140                 }
12141             }
12142         }
12143     }                           /* BBs in loop */
12144
12145   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
12146      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
12147   if (integer_onep (step_vector))
12148     niters_no_overflow = true;
12149   vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12150                            niters_vector, step_vector, niters_vector_mult_vf,
12151                            !niters_no_overflow);
12152
12153   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12154
12155   /* True if the final iteration might not handle a full vector's
12156      worth of scalar iterations.  */
12157   bool final_iter_may_be_partial
12158     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12159   /* The minimum number of iterations performed by the epilogue.  This
12160      is 1 when peeling for gaps because we always need a final scalar
12161      iteration.  */
12162   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12163   /* +1 to convert latch counts to loop iteration counts,
12164      -min_epilogue_iters to remove iterations that cannot be performed
12165        by the vector code.  */
12166   int bias_for_lowest = 1 - min_epilogue_iters;
12167   int bias_for_assumed = bias_for_lowest;
12168   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12169   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12170     {
12171       /* When the amount of peeling is known at compile time, the first
12172          iteration will have exactly alignment_npeels active elements.
12173          In the worst case it will have at least one.  */
12174       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12175       bias_for_lowest += lowest_vf - min_first_active;
12176       bias_for_assumed += assumed_vf - min_first_active;
12177     }
12178   /* In these calculations the "- 1" converts loop iteration counts
12179      back to latch counts.  */
12180   if (loop->any_upper_bound)
12181     {
12182       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12183       loop->nb_iterations_upper_bound
12184         = (final_iter_may_be_partial
12185            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12186                             lowest_vf) - 1
12187            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12188                              lowest_vf) - 1);
12189       if (main_vinfo
12190           /* Both peeling for alignment and peeling for gaps can end up
12191              with the scalar epilogue running for more than VF-1 iterations.  */
12192           && !main_vinfo->peeling_for_alignment
12193           && !main_vinfo->peeling_for_gaps)
12194         {
12195           unsigned int bound;
12196           poly_uint64 main_iters
12197             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12198                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12199           main_iters
12200             = upper_bound (main_iters,
12201                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12202           if (can_div_away_from_zero_p (main_iters,
12203                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12204                                         &bound))
12205             loop->nb_iterations_upper_bound
12206               = wi::umin ((bound_wide_int) (bound - 1),
12207                           loop->nb_iterations_upper_bound);
12208       }
12209   }
12210   if (loop->any_likely_upper_bound)
12211     loop->nb_iterations_likely_upper_bound
12212       = (final_iter_may_be_partial
12213          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12214                           + bias_for_lowest, lowest_vf) - 1
12215          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12216                            + bias_for_lowest, lowest_vf) - 1);
12217   if (loop->any_estimate)
12218     loop->nb_iterations_estimate
12219       = (final_iter_may_be_partial
12220          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12221                           assumed_vf) - 1
12222          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12223                            assumed_vf) - 1);
12224   scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12225                                assumed_vf, flat);
12226
12227   if (dump_enabled_p ())
12228     {
12229       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12230         {
12231           dump_printf_loc (MSG_NOTE, vect_location,
12232                            "LOOP VECTORIZED\n");
12233           if (loop->inner)
12234             dump_printf_loc (MSG_NOTE, vect_location,
12235                              "OUTER LOOP VECTORIZED\n");
12236           dump_printf (MSG_NOTE, "\n");
12237         }
12238       else
12239         dump_printf_loc (MSG_NOTE, vect_location,
12240                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12241                          GET_MODE_NAME (loop_vinfo->vector_mode));
12242     }
12243
12244   /* Loops vectorized with a variable factor won't benefit from
12245      unrolling/peeling.  */
12246   if (!vf.is_constant ())
12247     {
12248       loop->unroll = 1;
12249       if (dump_enabled_p ())
12250         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12251                          " variable-length vectorization factor\n");
12252     }
12253   /* Free SLP instances here because otherwise stmt reference counting
12254      won't work.  */
12255   slp_instance instance;
12256   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12257     vect_free_slp_instance (instance);
12258   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12259   /* Clear-up safelen field since its value is invalid after vectorization
12260      since vectorized loop can have loop-carried dependencies.  */
12261   loop->safelen = 0;
12262
12263   if (epilogue)
12264     {
12265       update_epilogue_loop_vinfo (epilogue, advance);
12266
12267       epilogue->simduid = loop->simduid;
12268       epilogue->force_vectorize = loop->force_vectorize;
12269       epilogue->dont_vectorize = false;
12270     }
12271
12272   return epilogue;
12273 }
12274
12275 /* The code below is trying to perform simple optimization - revert
12276    if-conversion for masked stores, i.e. if the mask of a store is zero
12277    do not perform it and all stored value producers also if possible.
12278    For example,
12279      for (i=0; i<n; i++)
12280        if (c[i])
12281         {
12282           p1[i] += 1;
12283           p2[i] = p3[i] +2;
12284         }
12285    this transformation will produce the following semi-hammock:
12286
12287    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12288      {
12289        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12290        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12291        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12292        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12293        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12294        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12295      }
12296 */
12297
12298 void
12299 optimize_mask_stores (class loop *loop)
12300 {
12301   basic_block *bbs = get_loop_body (loop);
12302   unsigned nbbs = loop->num_nodes;
12303   unsigned i;
12304   basic_block bb;
12305   class loop *bb_loop;
12306   gimple_stmt_iterator gsi;
12307   gimple *stmt;
12308   auto_vec<gimple *> worklist;
12309   auto_purge_vect_location sentinel;
12310
12311   vect_location = find_loop_location (loop);
12312   /* Pick up all masked stores in loop if any.  */
12313   for (i = 0; i < nbbs; i++)
12314     {
12315       bb = bbs[i];
12316       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12317            gsi_next (&gsi))
12318         {
12319           stmt = gsi_stmt (gsi);
12320           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12321             worklist.safe_push (stmt);
12322         }
12323     }
12324
12325   free (bbs);
12326   if (worklist.is_empty ())
12327     return;
12328
12329   /* Loop has masked stores.  */
12330   while (!worklist.is_empty ())
12331     {
12332       gimple *last, *last_store;
12333       edge e, efalse;
12334       tree mask;
12335       basic_block store_bb, join_bb;
12336       gimple_stmt_iterator gsi_to;
12337       tree vdef, new_vdef;
12338       gphi *phi;
12339       tree vectype;
12340       tree zero;
12341
12342       last = worklist.pop ();
12343       mask = gimple_call_arg (last, 2);
12344       bb = gimple_bb (last);
12345       /* Create then_bb and if-then structure in CFG, then_bb belongs to
12346          the same loop as if_bb.  It could be different to LOOP when two
12347          level loop-nest is vectorized and mask_store belongs to the inner
12348          one.  */
12349       e = split_block (bb, last);
12350       bb_loop = bb->loop_father;
12351       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12352       join_bb = e->dest;
12353       store_bb = create_empty_bb (bb);
12354       add_bb_to_loop (store_bb, bb_loop);
12355       e->flags = EDGE_TRUE_VALUE;
12356       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12357       /* Put STORE_BB to likely part.  */
12358       efalse->probability = profile_probability::likely ();
12359       e->probability = efalse->probability.invert ();
12360       store_bb->count = efalse->count ();
12361       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12362       if (dom_info_available_p (CDI_DOMINATORS))
12363         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12364       if (dump_enabled_p ())
12365         dump_printf_loc (MSG_NOTE, vect_location,
12366                          "Create new block %d to sink mask stores.",
12367                          store_bb->index);
12368       /* Create vector comparison with boolean result.  */
12369       vectype = TREE_TYPE (mask);
12370       zero = build_zero_cst (vectype);
12371       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12372       gsi = gsi_last_bb (bb);
12373       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12374       /* Create new PHI node for vdef of the last masked store:
12375          .MEM_2 = VDEF <.MEM_1>
12376          will be converted to
12377          .MEM.3 = VDEF <.MEM_1>
12378          and new PHI node will be created in join bb
12379          .MEM_2 = PHI <.MEM_1, .MEM_3>
12380       */
12381       vdef = gimple_vdef (last);
12382       new_vdef = make_ssa_name (gimple_vop (cfun), last);
12383       gimple_set_vdef (last, new_vdef);
12384       phi = create_phi_node (vdef, join_bb);
12385       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12386
12387       /* Put all masked stores with the same mask to STORE_BB if possible.  */
12388       while (true)
12389         {
12390           gimple_stmt_iterator gsi_from;
12391           gimple *stmt1 = NULL;
12392
12393           /* Move masked store to STORE_BB.  */
12394           last_store = last;
12395           gsi = gsi_for_stmt (last);
12396           gsi_from = gsi;
12397           /* Shift GSI to the previous stmt for further traversal.  */
12398           gsi_prev (&gsi);
12399           gsi_to = gsi_start_bb (store_bb);
12400           gsi_move_before (&gsi_from, &gsi_to);
12401           /* Setup GSI_TO to the non-empty block start.  */
12402           gsi_to = gsi_start_bb (store_bb);
12403           if (dump_enabled_p ())
12404             dump_printf_loc (MSG_NOTE, vect_location,
12405                              "Move stmt to created bb\n%G", last);
12406           /* Move all stored value producers if possible.  */
12407           while (!gsi_end_p (gsi))
12408             {
12409               tree lhs;
12410               imm_use_iterator imm_iter;
12411               use_operand_p use_p;
12412               bool res;
12413
12414               /* Skip debug statements.  */
12415               if (is_gimple_debug (gsi_stmt (gsi)))
12416                 {
12417                   gsi_prev (&gsi);
12418                   continue;
12419                 }
12420               stmt1 = gsi_stmt (gsi);
12421               /* Do not consider statements writing to memory or having
12422                  volatile operand.  */
12423               if (gimple_vdef (stmt1)
12424                   || gimple_has_volatile_ops (stmt1))
12425                 break;
12426               gsi_from = gsi;
12427               gsi_prev (&gsi);
12428               lhs = gimple_get_lhs (stmt1);
12429               if (!lhs)
12430                 break;
12431
12432               /* LHS of vectorized stmt must be SSA_NAME.  */
12433               if (TREE_CODE (lhs) != SSA_NAME)
12434                 break;
12435
12436               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12437                 {
12438                   /* Remove dead scalar statement.  */
12439                   if (has_zero_uses (lhs))
12440                     {
12441                       gsi_remove (&gsi_from, true);
12442                       continue;
12443                     }
12444                 }
12445
12446               /* Check that LHS does not have uses outside of STORE_BB.  */
12447               res = true;
12448               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12449                 {
12450                   gimple *use_stmt;
12451                   use_stmt = USE_STMT (use_p);
12452                   if (is_gimple_debug (use_stmt))
12453                     continue;
12454                   if (gimple_bb (use_stmt) != store_bb)
12455                     {
12456                       res = false;
12457                       break;
12458                     }
12459                 }
12460               if (!res)
12461                 break;
12462
12463               if (gimple_vuse (stmt1)
12464                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
12465                 break;
12466
12467               /* Can move STMT1 to STORE_BB.  */
12468               if (dump_enabled_p ())
12469                 dump_printf_loc (MSG_NOTE, vect_location,
12470                                  "Move stmt to created bb\n%G", stmt1);
12471               gsi_move_before (&gsi_from, &gsi_to);
12472               /* Shift GSI_TO for further insertion.  */
12473               gsi_prev (&gsi_to);
12474             }
12475           /* Put other masked stores with the same mask to STORE_BB.  */
12476           if (worklist.is_empty ()
12477               || gimple_call_arg (worklist.last (), 2) != mask
12478               || worklist.last () != stmt1)
12479             break;
12480           last = worklist.pop ();
12481         }
12482       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12483     }
12484 }
12485
12486 /* Decide whether it is possible to use a zero-based induction variable
12487    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
12488    the value that the induction variable must be able to hold in order
12489    to ensure that the rgroups eventually have no active vector elements.
12490    Return -1 otherwise.  */
12491
12492 widest_int
12493 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12494 {
12495   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12496   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12497   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12498
12499   /* Calculate the value that the induction variable must be able
12500      to hit in order to ensure that we end the loop with an all-false mask.
12501      This involves adding the maximum number of inactive trailing scalar
12502      iterations.  */
12503   widest_int iv_limit = -1;
12504   if (max_loop_iterations (loop, &iv_limit))
12505     {
12506       if (niters_skip)
12507         {
12508           /* Add the maximum number of skipped iterations to the
12509              maximum iteration count.  */
12510           if (TREE_CODE (niters_skip) == INTEGER_CST)
12511             iv_limit += wi::to_widest (niters_skip);
12512           else
12513             iv_limit += max_vf - 1;
12514         }
12515       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12516         /* Make a conservatively-correct assumption.  */
12517         iv_limit += max_vf - 1;
12518
12519       /* IV_LIMIT is the maximum number of latch iterations, which is also
12520          the maximum in-range IV value.  Round this value down to the previous
12521          vector alignment boundary and then add an extra full iteration.  */
12522       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12523       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12524     }
12525   return iv_limit;
12526 }
12527
12528 /* For the given rgroup_controls RGC, check whether an induction variable
12529    would ever hit a value that produces a set of all-false masks or zero
12530    lengths before wrapping around.  Return true if it's possible to wrap
12531    around before hitting the desirable value, otherwise return false.  */
12532
12533 bool
12534 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12535 {
12536   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12537
12538   if (iv_limit == -1)
12539     return true;
12540
12541   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12542   unsigned int compare_precision = TYPE_PRECISION (compare_type);
12543   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12544
12545   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12546     return true;
12547
12548   return false;
12549 }