gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "memmodel.h"
  36 #include "optabs.h"
  37 #include "diagnostic-core.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "cfganal.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-ssa-loop.h"
  48 #include "cfgloop.h"
  49 #include "tree-scalar-evolution.h"
  50 #include "tree-vectorizer.h"
  51 #include "gimple-fold.h"
  52 #include "cgraph.h"
  53 #include "tree-cfg.h"
  54 #include "tree-if-conv.h"
  55 #include "internal-fn.h"
  56 #include "tree-vector-builder.h"
  57 #include "vec-perm-indices.h"
  58 #include "tree-eh.h"
  59 #include "case-cfn-macros.h"
  60 #include "langhooks.h"
  61
  62 /* Loop Vectorization Pass.
  63
  64    This pass tries to vectorize loops.
  65
  66    For example, the vectorizer transforms the following simple loop:
  67
  68         short a[N]; short b[N]; short c[N]; int i;
  69
  70         for (i=0; i<N; i++){
  71           a[i] = b[i] + c[i];
  72         }
  73
  74    as if it was manually vectorized by rewriting the source code into:
  75
  76         typedef int __attribute__((mode(V8HI))) v8hi;
  77         short a[N];  short b[N]; short c[N];   int i;
  78         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  79         v8hi va, vb, vc;
  80
  81         for (i=0; i<N/8; i++){
  82           vb = pb[i];
  83           vc = pc[i];
  84           va = vb + vc;
  85           pa[i] = va;
  86         }
  87
  88         The main entry to this pass is vectorize_loops(), in which
  89    the vectorizer applies a set of analyses on a given set of loops,
  90    followed by the actual vectorization transformation for the loops that
  91    had successfully passed the analysis phase.
  92         Throughout this pass we make a distinction between two types of
  93    data: scalars (which are represented by SSA_NAMES), and memory references
  94    ("data-refs").  These two types of data require different handling both
  95    during analysis and transformation. The types of data-refs that the
   96    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  97    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  98    accesses are required to have a simple (consecutive) access pattern.
  99
 100    Analysis phase:
 101    ===============
 102         The driver for the analysis phase is vect_analyze_loop().
 103    It applies a set of analyses, some of which rely on the scalar evolution
 104    analyzer (scev) developed by Sebastian Pop.
 105
 106         During the analysis phase the vectorizer records some information
 107    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 108    loop, as well as general information about the loop as a whole, which is
 109    recorded in a "loop_vec_info" struct attached to each loop.
 110
 111    Transformation phase:
 112    =====================
 113         The loop transformation phase scans all the stmts in the loop, and
 114    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 115    the loop that needs to be vectorized.  It inserts the vector code sequence
 116    just before the scalar stmt S, and records a pointer to the vector code
 117    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 118    attached to S).  This pointer will be used for the vectorization of following
 119    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 120    otherwise, we rely on dead code elimination for removing it.
 121
 122         For example, say stmt S1 was vectorized into stmt VS1:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    S2:  a = b;
 127
 128    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 129    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 130    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 131    resulting sequence would be:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    VS2: va = vb;
 136    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 137
 138         Operands that are not SSA_NAMEs, are data-refs that appear in
 139    load/store operations (like 'x[i]' in S1), and are handled differently.
 140
 141    Target modeling:
 142    =================
 143         Currently the only target specific information that is used is the
 144    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 145    Targets that can support different sizes of vectors, for now will need
 146    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 147    flexibility will be added in the future.
 148
  149         Since we only vectorize operations whose vector form can be
 150    expressed using existing tree codes, to verify that an operation is
 151    supported, the vectorizer checks the relevant optab at the relevant
  152    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 153    the value found is CODE_FOR_nothing, then there's no target support, and
 154    we can't vectorize the stmt.
 155
 156    For additional information on this project see:
 157    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 158 */
 159
  /* Forward declarations of file-local (static) functions.
     vect_is_simple_reduction is called from vect_analyze_scalar_cycles_1
     further down in this file.  */
  160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
  161                                                 unsigned *);
  162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
  163                                                bool *, bool *, bool);
 164
 165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 166    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 167    may already be set for general statements (not just data refs).  */
 168
 169 static opt_result
 170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 171                               bool vectype_maybe_set_p,
 172                               poly_uint64 *vf)
 173 {
 174   gimple *stmt = stmt_info->stmt;
 175
 176   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 177        && !STMT_VINFO_LIVE_P (stmt_info))
 178       || gimple_clobber_p (stmt))
 179     {
 180       if (dump_enabled_p ())
 181         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 182       return opt_result::success ();
 183     }
 184
 185   tree stmt_vectype, nunits_vectype;
 186   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 187                                                    &stmt_vectype,
 188                                                    &nunits_vectype);
 189   if (!res)
 190     return res;
 191
 192   if (stmt_vectype)
 193     {
 194       if (STMT_VINFO_VECTYPE (stmt_info))
 195         /* The only case when a vectype had been already set is for stmts
 196            that contain a data ref, or for "pattern-stmts" (stmts generated
 197            by the vectorizer to represent/replace a certain idiom).  */
 198         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 199                      || vectype_maybe_set_p)
 200                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 201       else
 202         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 203     }
 204
 205   if (nunits_vectype)
 206     vect_update_max_nunits (vf, nunits_vectype);
 207
 208   return opt_result::success ();
 209 }
 210
 211 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 212    types of STMT_INFO and all attached pattern statements and update
 213    the vectorization factor VF accordingly.  Return true on success
 214    or false if something prevented vectorization.  */
 215
 216 static opt_result
 217 vect_determine_vf_for_stmt (vec_info *vinfo,
 218                             stmt_vec_info stmt_info, poly_uint64 *vf)
 219 {
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 222                      stmt_info->stmt);
 223   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 224   if (!res)
 225     return res;
 226
 227   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 228       && STMT_VINFO_RELATED_STMT (stmt_info))
 229     {
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 232
 233       /* If a pattern statement has def stmts, analyze them too.  */
 234       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 235            !gsi_end_p (si); gsi_next (&si))
 236         {
 237           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 238           if (dump_enabled_p ())
 239             dump_printf_loc (MSG_NOTE, vect_location,
 240                              "==> examining pattern def stmt: %G",
 241                              def_stmt_info->stmt);
 242           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 243           if (!res)
 244             return res;
 245         }
 246
 247       if (dump_enabled_p ())
 248         dump_printf_loc (MSG_NOTE, vect_location,
 249                          "==> examining pattern statement: %G",
 250                          stmt_info->stmt);
 251       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 252       if (!res)
 253         return res;
 254     }
 255
 256   return opt_result::success ();
 257 }
 258
 259 /* Function vect_determine_vectorization_factor
 260
 261    Determine the vectorization factor (VF).  VF is the number of data elements
 262    that are operated upon in parallel in a single iteration of the vectorized
 263    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 264    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 265    elements can fit in a single vector register.
 266
 267    We currently support vectorization of loops in which all types operated upon
 268    are of the same size.  Therefore this function currently sets VF according to
 269    the size of the types operated upon, and fails if there are multiple sizes
 270    in the loop.
 271
 272    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 273    original loop:
 274         for (i=0; i<N; i++){
 275           a[i] = b[i] + c[i];
 276         }
 277
 278    vectorized loop:
 279         for (i=0; i<N; i+=VF){
 280           a[i:VF] = b[i:VF] + c[i:VF];
 281         }
 282 */
 283
 284 static opt_result
 285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 286 {
 287   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 288   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 289   unsigned nbbs = loop->num_nodes;
 290   poly_uint64 vectorization_factor = 1;
 291   tree scalar_type = NULL_TREE;
 292   gphi *phi;
 293   tree vectype;
 294   stmt_vec_info stmt_info;
 295   unsigned i;
 296
 297   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 298
 299   for (i = 0; i < nbbs; i++)
 300     {
 301       basic_block bb = bbs[i];
 302
 303       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 304            gsi_next (&si))
 305         {
 306           phi = si.phi ();
 307           stmt_info = loop_vinfo->lookup_stmt (phi);
 308           if (dump_enabled_p ())
 309             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 310                              (gimple *) phi);
 311
 312           gcc_assert (stmt_info);
 313
 314           if (STMT_VINFO_RELEVANT_P (stmt_info)
 315               || STMT_VINFO_LIVE_P (stmt_info))
 316             {
 317               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 318               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 319
 320               if (dump_enabled_p ())
 321                 dump_printf_loc (MSG_NOTE, vect_location,
 322                                  "get vectype for scalar type:  %T\n",
 323                                  scalar_type);
 324
 325               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 326               if (!vectype)
 327                 return opt_result::failure_at (phi,
 328                                                "not vectorized: unsupported "
 329                                                "data-type %T\n",
 330                                                scalar_type);
 331               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 332
 333               if (dump_enabled_p ())
 334                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 335                                  vectype);
 336
 337               if (dump_enabled_p ())
 338                 {
 339                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 340                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 341                   dump_printf (MSG_NOTE, "\n");
 342                 }
 343
 344               vect_update_max_nunits (&vectorization_factor, vectype);
 345             }
 346         }
 347
 348       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 349            gsi_next (&si))
 350         {
 351           if (is_gimple_debug (gsi_stmt (si)))
 352             continue;
 353           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 354           opt_result res
 355             = vect_determine_vf_for_stmt (loop_vinfo,
 356                                           stmt_info, &vectorization_factor);
 357           if (!res)
 358             return res;
 359         }
 360     }
 361
 362   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 363   if (dump_enabled_p ())
 364     {
 365       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 366       dump_dec (MSG_NOTE, vectorization_factor);
 367       dump_printf (MSG_NOTE, "\n");
 368     }
 369
 370   if (known_le (vectorization_factor, 1U))
 371     return opt_result::failure_at (vect_location,
 372                                    "not vectorized: unsupported data-type\n");
 373   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 374   return opt_result::success ();
 375 }
 376
 377
 378 /* Function vect_is_simple_iv_evolution.
 379
 380    FORNOW: A simple evolution of an induction variables in the loop is
 381    considered a polynomial evolution.  */
 382
 383 static bool
 384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 385                              tree * step)
 386 {
 387   tree init_expr;
 388   tree step_expr;
 389   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 390   basic_block bb;
 391
 392   /* When there is no evolution in this loop, the evolution function
 393      is not "simple".  */
 394   if (evolution_part == NULL_TREE)
 395     return false;
 396
 397   /* When the evolution is a polynomial of degree >= 2
 398      the evolution function is not "simple".  */
 399   if (tree_is_chrec (evolution_part))
 400     return false;
 401
 402   step_expr = evolution_part;
 403   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 404
 405   if (dump_enabled_p ())
 406     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 407                      step_expr, init_expr);
 408
 409   *init = init_expr;
 410   *step = step_expr;
 411
 412   if (TREE_CODE (step_expr) != INTEGER_CST
 413       && (TREE_CODE (step_expr) != SSA_NAME
 414           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 415               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 416           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 417               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 418                   || !flag_associative_math)))
 419       && (TREE_CODE (step_expr) != REAL_CST
 420           || !flag_associative_math))
 421     {
 422       if (dump_enabled_p ())
 423         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 424                          "step unknown.\n");
 425       return false;
 426     }
 427
 428   return true;
 429 }
 430
 431 /* Function vect_is_nonlinear_iv_evolution
 432
 433    Only support nonlinear induction for integer type
 434    1. neg
 435    2. mul by constant
 436    3. lshift/rshift by constant.
 437
 438    For neg induction, return a fake step as integer -1.  */
 439 static bool
 440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 441                                 gphi* loop_phi_node, tree *init, tree *step)
 442 {
 443   tree init_expr, ev_expr, result, op1, op2;
 444   gimple* def;
 445
 446   if (gimple_phi_num_args (loop_phi_node) != 2)
 447     return false;
 448
 449   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 450   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 451
 452   /* Support nonlinear induction only for integer type.  */
 453   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 454     return false;
 455
 456   *init = init_expr;
 457   result = PHI_RESULT (loop_phi_node);
 458
 459   if (TREE_CODE (ev_expr) != SSA_NAME
 460       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 461       || !is_gimple_assign (def))
 462     return false;
 463
 464   enum tree_code t_code = gimple_assign_rhs_code (def);
 465   switch (t_code)
 466     {
 467     case NEGATE_EXPR:
 468       if (gimple_assign_rhs1 (def) != result)
 469         return false;
 470       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 471       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 472       break;
 473
 474     case RSHIFT_EXPR:
 475     case LSHIFT_EXPR:
 476     case MULT_EXPR:
 477       op1 = gimple_assign_rhs1 (def);
 478       op2 = gimple_assign_rhs2 (def);
 479       if (TREE_CODE (op2) != INTEGER_CST
 480           || op1 != result)
 481         return false;
 482       *step = op2;
 483       if (t_code == LSHIFT_EXPR)
 484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 485       else if (t_code == RSHIFT_EXPR)
 486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 487       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 488       else
 489         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 490       break;
 491
 492     default:
 493       return false;
 494     }
 495
 496   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 497   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 498
 499   return true;
 500 }
 501
 502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 503    what we are assuming is a double reduction.  For example, given
 504    a structure like this:
 505
 506       outer1:
 507         x_1 = PHI <x_4(outer2), ...>;
 508         ...
 509
 510       inner:
 511         x_2 = PHI <x_1(outer1), ...>;
 512         ...
 513         x_3 = ...;
 514         ...
 515
 516       outer2:
 517         x_4 = PHI <x_3(inner)>;
 518         ...
 519
 520    outer loop analysis would treat x_1 as a double reduction phi and
 521    this function would then return true for x_2.  */
 522
 523 static bool
 524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 525 {
 526   use_operand_p use_p;
 527   ssa_op_iter op_iter;
 528   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 529     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 530       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 531         return true;
 532   return false;
 533 }
 534
 535 /* Returns true if Phi is a first-order recurrence. A first-order
 536    recurrence is a non-reduction recurrence relation in which the value of
 537    the recurrence in the current loop iteration equals a value defined in
 538    the previous iteration.  */
 539
 540 static bool
 541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 542                                    gphi *phi)
 543 {
 544   /* A nested cycle isn't vectorizable as first order recurrence.  */
 545   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 546     return false;
 547
 548   /* Ensure the loop latch definition is from within the loop.  */
 549   edge latch = loop_latch_edge (loop);
 550   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 551   if (TREE_CODE (ldef) != SSA_NAME
 552       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 553       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 554       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 555     return false;
 556
 557   tree def = gimple_phi_result (phi);
 558
 559   /* Ensure every use_stmt of the phi node is dominated by the latch
 560      definition.  */
 561   imm_use_iterator imm_iter;
 562   use_operand_p use_p;
 563   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 564     if (!is_gimple_debug (USE_STMT (use_p))
 565         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 566             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 567                                             USE_STMT (use_p))))
 568       return false;
 569
 570   /* First-order recurrence autovectorization needs shuffle vector.  */
 571   tree scalar_type = TREE_TYPE (def);
 572   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 573   if (!vectype)
 574     return false;
 575
 576   return true;
 577 }
 578
 579 /* Function vect_analyze_scalar_cycles_1.
 580
 581    Examine the cross iteration def-use cycles of scalar variables
 582    in LOOP.  LOOP_VINFO represents the loop that is now being
 583    considered for vectorization (can be LOOP, or an outer-loop
 584    enclosing LOOP).  SLP indicates there will be some subsequent
 585    slp analyses or not.  */
 586
 587 static void
 588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 589                               bool slp)
 590 {
 591   basic_block bb = loop->header;
 592   tree init, step;
 593   auto_vec<stmt_vec_info, 64> worklist;
 594   gphi_iterator gsi;
 595   bool double_reduc, reduc_chain;
 596
 597   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 598
 599   /* First - identify all inductions.  Reduction detection assumes that all the
 600      inductions have been identified, therefore, this order must not be
 601      changed.  */
 602   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 603     {
 604       gphi *phi = gsi.phi ();
 605       tree access_fn = NULL;
 606       tree def = PHI_RESULT (phi);
 607       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 611                          (gimple *) phi);
 612
 613       /* Skip virtual phi's.  The data dependences that are associated with
 614          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 615       if (virtual_operand_p (def))
 616         continue;
 617
 618       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 619
 620       /* Analyze the evolution function.  */
 621       access_fn = analyze_scalar_evolution (loop, def);
 622       if (access_fn)
 623         {
 624           STRIP_NOPS (access_fn);
 625           if (dump_enabled_p ())
 626             dump_printf_loc (MSG_NOTE, vect_location,
 627                              "Access function of PHI: %T\n", access_fn);
 628           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 629             = initial_condition_in_loop_num (access_fn, loop->num);
 630           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 631             = evolution_part_in_loop_num (access_fn, loop->num);
 632         }
 633
 634       if ((!access_fn
 635            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 636            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 637                                             &init, &step)
 638            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 639                && TREE_CODE (step) != INTEGER_CST))
 640           /* Only handle nonlinear iv for same loop.  */
 641           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 642               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 643                                                   phi, &init, &step)))
 644         {
 645           worklist.safe_push (stmt_vinfo);
 646           continue;
 647         }
 648
 649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 650                   != NULL_TREE);
 651       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 652
 653       if (dump_enabled_p ())
 654         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 655       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 656     }
 657
 658
 659   /* Second - identify all reductions and nested cycles.  */
 660   while (worklist.length () > 0)
 661     {
 662       stmt_vec_info stmt_vinfo = worklist.pop ();
 663       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 664       tree def = PHI_RESULT (phi);
 665
 666       if (dump_enabled_p ())
 667         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 668                          (gimple *) phi);
 669
 670       gcc_assert (!virtual_operand_p (def)
 671                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 672
 673       stmt_vec_info reduc_stmt_info
 674         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 675                                     &reduc_chain, slp);
 676       if (reduc_stmt_info)
 677         {
 678           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 679           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 680           if (double_reduc)
 681             {
 682               if (dump_enabled_p ())
 683                 dump_printf_loc (MSG_NOTE, vect_location,
 684                                  "Detected double reduction.\n");
 685
 686               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 687               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 688             }
 689           else
 690             {
 691               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 692                 {
 693                   if (dump_enabled_p ())
 694                     dump_printf_loc (MSG_NOTE, vect_location,
 695                                      "Detected vectorizable nested cycle.\n");
 696
 697                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 698                 }
 699               else
 700                 {
 701                   if (dump_enabled_p ())
 702                     dump_printf_loc (MSG_NOTE, vect_location,
 703                                      "Detected reduction.\n");
 704
 705                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 706                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 707                   /* Store the reduction cycles for possible vectorization in
 708                      loop-aware SLP if it was not detected as reduction
 709                      chain.  */
 710                   if (! reduc_chain)
 711                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 712                       (reduc_stmt_info);
 713                 }
 714             }
 715         }
 716       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 717         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 718       else
 719         if (dump_enabled_p ())
 720           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                            "Unknown def-use cycle pattern.\n");
 722     }
 723 }
 724
 725
 726 /* Function vect_analyze_scalar_cycles.
 727
 728    Examine the cross iteration def-use cycles of scalar variables, by
 729    analyzing the loop-header PHIs of scalar variables.  Classify each
 730    cycle as one of the following: invariant, induction, reduction, unknown.
 731    We do that for the loop represented by LOOP_VINFO, and also to its
 732    inner-loop, if exists.
 733    Examples for scalar cycles:
 734
 735    Example1: reduction:
 736
 737               loop1:
 738               for (i=0; i<N; i++)
 739                  sum += a[i];
 740
 741    Example2: induction:
 742
 743               loop2:
 744               for (i=0; i<N; i++)
 745                  a[i] = i;  */
 746
 747 static void
 748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 749 {
 750   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 751
 752   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 753
 754   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 755      Reductions in such inner-loop therefore have different properties than
 756      the reductions in the nest that gets vectorized:
 757      1. When vectorized, they are executed in the same order as in the original
 758         scalar loop, so we can't change the order of computation when
 759         vectorizing them.
 760      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 761         current checks are too strict.  */
 762
 763   if (loop->inner)
 764     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 765 }
 766
 767 /* Transfer group and reduction information from STMT_INFO to its
 768    pattern stmt.  */
 769
 770 static void
 771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 772 {
 773   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 774   stmt_vec_info stmtp;
 775   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 776               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 777   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 778   do
 779     {
 780       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 781       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 782                            == STMT_VINFO_DEF_TYPE (stmt_info));
 783       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 784       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 785       if (stmt_info)
 786         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 787           = STMT_VINFO_RELATED_STMT (stmt_info);
 788     }
 789   while (stmt_info);
 790 }
 791
 792 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 793
 794 static void
 795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 796 {
 797   stmt_vec_info first;
 798   unsigned i;
 799
 800   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 801     {
 802       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 803       while (next)
 804         {
 805           if ((STMT_VINFO_IN_PATTERN_P (next)
 806                != STMT_VINFO_IN_PATTERN_P (first))
 807               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 808             break;
 809           next = REDUC_GROUP_NEXT_ELEMENT (next);
 810         }
 811       /* If all reduction chain members are well-formed patterns adjust
 812          the group to group the pattern stmts instead.  */
 813       if (! next
 814           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 815         {
 816           if (STMT_VINFO_IN_PATTERN_P (first))
 817             {
 818               vect_fixup_reduc_chain (first);
 819               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 820                 = STMT_VINFO_RELATED_STMT (first);
 821             }
 822         }
 823       /* If not all stmt in the chain are patterns or if we failed
 824          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 825          it as regular reduction instead.  */
 826       else
 827         {
 828           stmt_vec_info vinfo = first;
 829           stmt_vec_info last = NULL;
 830           while (vinfo)
 831             {
 832               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 833               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 834               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 835               last = vinfo;
 836               vinfo = next;
 837             }
 838           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 839             = vect_internal_def;
 840           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 841           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 842           --i;
 843         }
 844     }
 845 }
 846
 847 /* Function vect_get_loop_niters.
 848
 849    Determine how many iterations the loop is executed and place it
 850    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 851    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 852    niter information holds in ASSUMPTIONS.
 853
 854    Return the loop exit conditions.  */
 855
 856
 857 static vec<gcond *>
 858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
 859                       tree *number_of_iterations, tree *number_of_iterationsm1)
 860 {
 861   auto_vec<edge> exits = get_loop_exit_edges (loop);
 862   vec<gcond *> conds;
 863   conds.create (exits.length ());
 864   class tree_niter_desc niter_desc;
 865   tree niter_assumptions, niter, may_be_zero;
 866
 867   *assumptions = boolean_true_node;
 868   *number_of_iterationsm1 = chrec_dont_know;
 869   *number_of_iterations = chrec_dont_know;
 870
 871   DUMP_VECT_SCOPE ("get_loop_niters");
 872
 873   if (exits.is_empty ())
 874     return conds;
 875
 876   if (dump_enabled_p ())
 877     dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
 878                      exits.length ());
 879
 880   edge exit;
 881   unsigned int i;
 882   FOR_EACH_VEC_ELT (exits, i, exit)
 883     {
 884       gcond *cond = get_loop_exit_condition (exit);
 885       if (cond)
 886         conds.safe_push (cond);
 887
 888       if (dump_enabled_p ())
 889         dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
 890
 891       if (exit != main_exit)
 892         continue;
 893
 894       may_be_zero = NULL_TREE;
 895       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 896           || chrec_contains_undetermined (niter_desc.niter))
 897         continue;
 898
 899       niter_assumptions = niter_desc.assumptions;
 900       may_be_zero = niter_desc.may_be_zero;
 901       niter = niter_desc.niter;
 902
 903       if (may_be_zero && integer_zerop (may_be_zero))
 904         may_be_zero = NULL_TREE;
 905
 906       if (may_be_zero)
 907         {
 908           if (COMPARISON_CLASS_P (may_be_zero))
 909             {
 910               /* Try to combine may_be_zero with assumptions, this can simplify
 911                  computation of niter expression.  */
 912               if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 913                 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 914                                                  niter_assumptions,
 915                                                  fold_build1 (TRUTH_NOT_EXPR,
 916                                                               boolean_type_node,
 917                                                               may_be_zero));
 918               else
 919                 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 920                                      build_int_cst (TREE_TYPE (niter), 0),
 921                                      rewrite_to_non_trapping_overflow (niter));
 922
 923               may_be_zero = NULL_TREE;
 924             }
 925           else if (integer_nonzerop (may_be_zero))
 926             {
 927               *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 928               *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 929               continue;
 930             }
 931           else
 932             continue;
 933        }
 934
 935       /* Loop assumptions are based off the normal exit.  */
 936       *assumptions = niter_assumptions;
 937       *number_of_iterationsm1 = niter;
 938
 939       /* We want the number of loop header executions which is the number
 940          of latch executions plus one.
 941          ???  For UINT_MAX latch executions this number overflows to zero
 942          for loops like do { n++; } while (n != 0);  */
 943       if (niter && !chrec_contains_undetermined (niter))
 944         {
 945           niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
 946                                unshare_expr (niter),
 947                                build_int_cst (TREE_TYPE (niter), 1));
 948           if (TREE_CODE (niter) == INTEGER_CST
 949               && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
 950             {
 951               /* If we manage to fold niter + 1 into INTEGER_CST even when
 952                  niter is some complex expression, ensure back
 953                  *number_of_iterationsm1 is an INTEGER_CST as well.  See
 954                  PR113210.  */
 955               *number_of_iterationsm1
 956                 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
 957                                build_minus_one_cst (TREE_TYPE (niter)));
 958             }
 959         }
 960       *number_of_iterations = niter;
 961     }
 962
 963   if (dump_enabled_p ())
 964     dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
 965
 966   return conds;
 967 }
 968
 969 /*  Determine the main loop exit for the vectorizer.  */
 970
 971 edge
 972 vec_init_loop_exit_info (class loop *loop)
 973 {
 974   /* Before we begin we must first determine which exit is the main one and
 975      which are auxilary exits.  */
 976   auto_vec<edge> exits = get_loop_exit_edges (loop);
 977   if (exits.length () == 1)
 978     return exits[0];
 979
 980   /* If we have multiple exits we only support counting IV at the moment.
 981      Analyze all exits and return the last one we can analyze.  */
 982   class tree_niter_desc niter_desc;
 983   edge candidate = NULL;
 984   for (edge exit : exits)
 985     {
 986       if (!get_loop_exit_condition (exit))
 987         continue;
 988
 989       if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 990           && !chrec_contains_undetermined (niter_desc.niter))
 991         {
 992           tree may_be_zero = niter_desc.may_be_zero;
 993           if ((integer_zerop (may_be_zero)
 994                /* As we are handling may_be_zero that's not false by
 995                   rewriting niter to may_be_zero ? 0 : niter we require
 996                   an empty latch.  */
 997                || (single_pred_p (loop->latch)
 998                    && exit->src == single_pred (loop->latch)
 999                    && (integer_nonzerop (may_be_zero)
1000                        || COMPARISON_CLASS_P (may_be_zero))))
1001               && (!candidate
1002                   || dominated_by_p (CDI_DOMINATORS, exit->src,
1003                                      candidate->src)))
1004             candidate = exit;
1005         }
1006     }
1007
1008   return candidate;
1009 }
1010
1011 /* Function bb_in_loop_p
1012
1013    Used as predicate for dfs order traversal of the loop bbs.  */
1014
1015 static bool
1016 bb_in_loop_p (const_basic_block bb, const void *data)
1017 {
1018   const class loop *const loop = (const class loop *)data;
1019   if (flow_bb_inside_loop_p (loop, bb))
1020     return true;
1021   return false;
1022 }
1023
1024
1025 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1026    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1027
1028 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1029   : vec_info (vec_info::loop, shared),
1030     loop (loop_in),
1031     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1032     num_itersm1 (NULL_TREE),
1033     num_iters (NULL_TREE),
1034     num_iters_unchanged (NULL_TREE),
1035     num_iters_assumptions (NULL_TREE),
1036     vector_costs (nullptr),
1037     scalar_costs (nullptr),
1038     th (0),
1039     versioning_threshold (0),
1040     vectorization_factor (0),
1041     main_loop_edge (nullptr),
1042     skip_main_loop_edge (nullptr),
1043     skip_this_loop_edge (nullptr),
1044     reusable_accumulators (),
1045     suggested_unroll_factor (1),
1046     max_vectorization_factor (0),
1047     mask_skip_niters (NULL_TREE),
1048     rgroup_compare_type (NULL_TREE),
1049     simd_if_cond (NULL_TREE),
1050     partial_vector_style (vect_partial_vectors_none),
1051     unaligned_dr (NULL),
1052     peeling_for_alignment (0),
1053     ptr_mask (0),
1054     ivexpr_map (NULL),
1055     scan_map (NULL),
1056     slp_unrolling_factor (1),
1057     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1058     vectorizable (false),
1059     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1060     using_partial_vectors_p (false),
1061     using_decrementing_iv_p (false),
1062     using_select_vl_p (false),
1063     epil_using_partial_vectors_p (false),
1064     partial_load_store_bias (0),
1065     peeling_for_gaps (false),
1066     peeling_for_niter (false),
1067     early_breaks (false),
1068     no_data_dependencies (false),
1069     has_mask_store (false),
1070     scalar_loop_scaling (profile_probability::uninitialized ()),
1071     scalar_loop (NULL),
1072     orig_loop_info (NULL),
1073     vec_loop_iv_exit (NULL),
1074     vec_epilogue_loop_iv_exit (NULL),
1075     scalar_loop_iv_exit (NULL)
1076 {
1077   /* CHECKME: We want to visit all BBs before their successors (except for
1078      latch blocks, for which this assertion wouldn't hold).  In the simple
1079      case of the loop forms we allow, a dfs order of the BBs would the same
1080      as reversed postorder traversal, so we are safe.  */
1081
1082   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1083                                           bbs, loop->num_nodes, loop);
1084   gcc_assert (nbbs == loop->num_nodes);
1085
1086   for (unsigned int i = 0; i < nbbs; i++)
1087     {
1088       basic_block bb = bbs[i];
1089       gimple_stmt_iterator si;
1090
1091       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1092         {
1093           gimple *phi = gsi_stmt (si);
1094           gimple_set_uid (phi, 0);
1095           add_stmt (phi);
1096         }
1097
1098       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1099         {
1100           gimple *stmt = gsi_stmt (si);
1101           gimple_set_uid (stmt, 0);
1102           if (is_gimple_debug (stmt))
1103             continue;
1104           add_stmt (stmt);
1105           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1106              third argument is the #pragma omp simd if (x) condition, when 0,
1107              loop shouldn't be vectorized, when non-zero constant, it should
1108              be vectorized normally, otherwise versioned with vectorized loop
1109              done if the condition is non-zero at runtime.  */
1110           if (loop_in->simduid
1111               && is_gimple_call (stmt)
1112               && gimple_call_internal_p (stmt)
1113               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1114               && gimple_call_num_args (stmt) >= 3
1115               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1116               && (loop_in->simduid
1117                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1118             {
1119               tree arg = gimple_call_arg (stmt, 2);
1120               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1121                 simd_if_cond = arg;
1122               else
1123                 gcc_assert (integer_nonzerop (arg));
1124             }
1125         }
1126     }
1127
1128   epilogue_vinfos.create (6);
1129 }
1130
1131 /* Free all levels of rgroup CONTROLS.  */
1132
1133 void
1134 release_vec_loop_controls (vec<rgroup_controls> *controls)
1135 {
1136   rgroup_controls *rgc;
1137   unsigned int i;
1138   FOR_EACH_VEC_ELT (*controls, i, rgc)
1139     rgc->controls.release ();
1140   controls->release ();
1141 }
1142
1143 /* Free all memory used by the _loop_vec_info, as well as all the
1144    stmt_vec_info structs of all the stmts in the loop.  */
1145
1146 _loop_vec_info::~_loop_vec_info ()
1147 {
1148   free (bbs);
1149
1150   release_vec_loop_controls (&masks.rgc_vec);
1151   release_vec_loop_controls (&lens);
1152   delete ivexpr_map;
1153   delete scan_map;
1154   epilogue_vinfos.release ();
1155   delete scalar_costs;
1156   delete vector_costs;
1157
1158   /* When we release an epiloge vinfo that we do not intend to use
1159      avoid clearing AUX of the main loop which should continue to
1160      point to the main loop vinfo since otherwise we'll leak that.  */
1161   if (loop->aux == this)
1162     loop->aux = NULL;
1163 }
1164
1165 /* Return an invariant or register for EXPR and emit necessary
1166    computations in the LOOP_VINFO loop preheader.  */
1167
1168 tree
1169 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1170 {
1171   if (is_gimple_reg (expr)
1172       || is_gimple_min_invariant (expr))
1173     return expr;
1174
1175   if (! loop_vinfo->ivexpr_map)
1176     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1177   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1178   if (! cached)
1179     {
1180       gimple_seq stmts = NULL;
1181       cached = force_gimple_operand (unshare_expr (expr),
1182                                      &stmts, true, NULL_TREE);
1183       if (stmts)
1184         {
1185           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1186           gsi_insert_seq_on_edge_immediate (e, stmts);
1187         }
1188     }
1189   return cached;
1190 }
1191
1192 /* Return true if we can use CMP_TYPE as the comparison type to produce
1193    all masks required to mask LOOP_VINFO.  */
1194
1195 static bool
1196 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1197 {
1198   rgroup_controls *rgm;
1199   unsigned int i;
1200   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1201     if (rgm->type != NULL_TREE
1202         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1203                                             cmp_type, rgm->type,
1204                                             OPTIMIZE_FOR_SPEED))
1205       return false;
1206   return true;
1207 }
1208
1209 /* Calculate the maximum number of scalars per iteration for every
1210    rgroup in LOOP_VINFO.  */
1211
1212 static unsigned int
1213 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1214 {
1215   unsigned int res = 1;
1216   unsigned int i;
1217   rgroup_controls *rgm;
1218   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1219     res = MAX (res, rgm->max_nscalars_per_iter);
1220   return res;
1221 }
1222
1223 /* Calculate the minimum precision necessary to represent:
1224
1225       MAX_NITERS * FACTOR
1226
1227    as an unsigned integer, where MAX_NITERS is the maximum number of
1228    loop header iterations for the original scalar form of LOOP_VINFO.  */
1229
1230 static unsigned
1231 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1232 {
1233   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1234
1235   /* Get the maximum number of iterations that is representable
1236      in the counter type.  */
1237   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1238   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1239
1240   /* Get a more refined estimate for the number of iterations.  */
1241   widest_int max_back_edges;
1242   if (max_loop_iterations (loop, &max_back_edges))
1243     max_ni = wi::smin (max_ni, max_back_edges + 1);
1244
1245   /* Work out how many bits we need to represent the limit.  */
1246   return wi::min_precision (max_ni * factor, UNSIGNED);
1247 }
1248
1249 /* True if the loop needs peeling or partial vectors when vectorized.  */
1250
1251 static bool
1252 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1253 {
1254   unsigned HOST_WIDE_INT const_vf;
1255   HOST_WIDE_INT max_niter
1256     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1257
1258   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1259   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1260     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1261                                           (loop_vinfo));
1262
1263   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1264       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1265     {
1266       /* Work out the (constant) number of iterations that need to be
1267          peeled for reasons other than niters.  */
1268       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1269       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1270         peel_niter += 1;
1271       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1272                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1273         return true;
1274     }
1275   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1276       /* ??? When peeling for gaps but not alignment, we could
1277          try to check whether the (variable) niters is known to be
1278          VF * N + 1.  That's something of a niche case though.  */
1279       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1280       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1281       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1282            < (unsigned) exact_log2 (const_vf))
1283           /* In case of versioning, check if the maximum number of
1284              iterations is greater than th.  If they are identical,
1285              the epilogue is unnecessary.  */
1286           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1287               || ((unsigned HOST_WIDE_INT) max_niter
1288                   /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1289                      but that's only computed later based on our result.
1290                      The following is the most conservative approximation.  */
1291                   > (std::max ((unsigned HOST_WIDE_INT) th,
1292                                const_vf) / const_vf) * const_vf))))
1293     return true;
1294
1295   return false;
1296 }
1297
1298 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1299    whether we can actually generate the masks required.  Return true if so,
1300    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1301
1302 static bool
1303 vect_verify_full_masking (loop_vec_info loop_vinfo)
1304 {
1305   unsigned int min_ni_width;
1306
1307   /* Use a normal loop if there are no statements that need masking.
1308      This only happens in rare degenerate cases: it means that the loop
1309      has no loads, no stores, and no live-out values.  */
1310   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1311     return false;
1312
1313   /* Produce the rgroup controls.  */
1314   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1315     {
1316       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1317       tree vectype = mask.first;
1318       unsigned nvectors = mask.second;
1319
1320       if (masks->rgc_vec.length () < nvectors)
1321         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1322       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1323       /* The number of scalars per iteration and the number of vectors are
1324          both compile-time constants.  */
1325       unsigned int nscalars_per_iter
1326           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1327                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1328
1329       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1330         {
1331           rgm->max_nscalars_per_iter = nscalars_per_iter;
1332           rgm->type = truth_type_for (vectype);
1333           rgm->factor = 1;
1334         }
1335     }
1336
1337   unsigned int max_nscalars_per_iter
1338     = vect_get_max_nscalars_per_iter (loop_vinfo);
1339
1340   /* Work out how many bits we need to represent the limit.  */
1341   min_ni_width
1342     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1343
1344   /* Find a scalar mode for which WHILE_ULT is supported.  */
1345   opt_scalar_int_mode cmp_mode_iter;
1346   tree cmp_type = NULL_TREE;
1347   tree iv_type = NULL_TREE;
1348   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1349   unsigned int iv_precision = UINT_MAX;
1350
1351   if (iv_limit != -1)
1352     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1353                                       UNSIGNED);
1354
1355   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1356     {
1357       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1358       if (cmp_bits >= min_ni_width
1359           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1360         {
1361           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1362           if (this_type
1363               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1364             {
1365               /* Although we could stop as soon as we find a valid mode,
1366                  there are at least two reasons why that's not always the
1367                  best choice:
1368
1369                  - An IV that's Pmode or wider is more likely to be reusable
1370                    in address calculations than an IV that's narrower than
1371                    Pmode.
1372
1373                  - Doing the comparison in IV_PRECISION or wider allows
1374                    a natural 0-based IV, whereas using a narrower comparison
1375                    type requires mitigations against wrap-around.
1376
1377                  Conversely, if the IV limit is variable, doing the comparison
1378                  in a wider type than the original type can introduce
1379                  unnecessary extensions, so picking the widest valid mode
1380                  is not always a good choice either.
1381
1382                  Here we prefer the first IV type that's Pmode or wider,
1383                  and the first comparison type that's IV_PRECISION or wider.
1384                  (The comparison type must be no wider than the IV type,
1385                  to avoid extensions in the vector loop.)
1386
1387                  ??? We might want to try continuing beyond Pmode for ILP32
1388                  targets if CMP_BITS < IV_PRECISION.  */
1389               iv_type = this_type;
1390               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1391                 cmp_type = this_type;
1392               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1393                 break;
1394             }
1395         }
1396     }
1397
1398   if (!cmp_type)
1399     {
1400       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1401       return false;
1402     }
1403
1404   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1405   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1406   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1407   return true;
1408 }
1409
1410 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1411    whether we can actually generate AVX512 style masks.  Return true if so,
1412    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1413
1414 static bool
1415 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1416 {
1417   /* Produce differently organized rgc_vec and differently check
1418      we can produce masks.  */
1419
1420   /* Use a normal loop if there are no statements that need masking.
1421      This only happens in rare degenerate cases: it means that the loop
1422      has no loads, no stores, and no live-out values.  */
1423   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1424     return false;
1425
1426   /* For the decrementing IV we need to represent all values in
1427      [0, niter + niter_skip] where niter_skip is the elements we
1428      skip in the first iteration for prologue peeling.  */
1429   tree iv_type = NULL_TREE;
1430   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1431   unsigned int iv_precision = UINT_MAX;
1432   if (iv_limit != -1)
1433     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1434
1435   /* First compute the type for the IV we use to track the remaining
1436      scalar iterations.  */
1437   opt_scalar_int_mode cmp_mode_iter;
1438   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1439     {
1440       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1441       if (cmp_bits >= iv_precision
1442           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1443         {
1444           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1445           if (iv_type)
1446             break;
1447         }
1448     }
1449   if (!iv_type)
1450     return false;
1451
1452   /* Produce the rgroup controls.  */
1453   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1454     {
1455       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1456       tree vectype = mask.first;
1457       unsigned nvectors = mask.second;
1458
1459       /* The number of scalars per iteration and the number of vectors are
1460          both compile-time constants.  */
1461       unsigned int nscalars_per_iter
1462         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1463                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1464
1465       /* We index the rgroup_controls vector with nscalars_per_iter
1466          which we keep constant and instead have a varying nvectors,
1467          remembering the vector mask with the fewest nV.  */
1468       if (masks->rgc_vec.length () < nscalars_per_iter)
1469         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1470       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1471
1472       if (!rgm->type || rgm->factor > nvectors)
1473         {
1474           rgm->type = truth_type_for (vectype);
1475           rgm->compare_type = NULL_TREE;
1476           rgm->max_nscalars_per_iter = nscalars_per_iter;
1477           rgm->factor = nvectors;
1478           rgm->bias_adjusted_ctrl = NULL_TREE;
1479         }
1480     }
1481
1482   /* There is no fixed compare type we are going to use but we have to
1483      be able to get at one for each mask group.  */
1484   unsigned int min_ni_width
1485     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1486
1487   bool ok = true;
1488   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1489     {
1490       tree mask_type = rgc.type;
1491       if (!mask_type)
1492         continue;
1493
1494       /* For now vect_get_loop_mask only supports integer mode masks
1495          when we need to split it.  */
1496       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1497           || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1498         {
1499           ok = false;
1500           break;
1501         }
1502
1503       /* If iv_type is usable as compare type use that - we can elide the
1504          saturation in that case.   */
1505       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1506         {
1507           tree cmp_vectype
1508             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1509           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1510             rgc.compare_type = cmp_vectype;
1511         }
1512       if (!rgc.compare_type)
1513         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1514           {
1515             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1516             if (cmp_bits >= min_ni_width
1517                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1518               {
1519                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1520                 if (!cmp_type)
1521                   continue;
1522
1523                 /* Check whether we can produce the mask with cmp_type.  */
1524                 tree cmp_vectype
1525                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1526                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1527                   {
1528                     rgc.compare_type = cmp_vectype;
1529                     break;
1530                   }
1531               }
1532         }
1533       if (!rgc.compare_type)
1534         {
1535           ok = false;
1536           break;
1537         }
1538     }
1539   if (!ok)
1540     {
1541       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1542       return false;
1543     }
1544
1545   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1546   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1547   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1548   return true;
1549 }
1550
1551 /* Check whether we can use vector access with length based on precison
1552    comparison.  So far, to keep it simple, we only allow the case that the
1553    precision of the target supported length is larger than the precision
1554    required by loop niters.  */
1555
1556 static bool
1557 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1558 {
1559   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1560     return false;
1561
1562   machine_mode len_load_mode, len_store_mode;
1563   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1564          .exists (&len_load_mode))
1565     return false;
1566   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1567          .exists (&len_store_mode))
1568     return false;
1569
1570   signed char partial_load_bias = internal_len_load_store_bias
1571     (IFN_LEN_LOAD, len_load_mode);
1572
1573   signed char partial_store_bias = internal_len_load_store_bias
1574     (IFN_LEN_STORE, len_store_mode);
1575
1576   gcc_assert (partial_load_bias == partial_store_bias);
1577
1578   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1579     return false;
1580
1581   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1582      len_loads with a length of zero.  In order to avoid that we prohibit
1583      more than one loop length here.  */
1584   if (partial_load_bias == -1
1585       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1586     return false;
1587
1588   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1589
1590   unsigned int max_nitems_per_iter = 1;
1591   unsigned int i;
1592   rgroup_controls *rgl;
1593   /* Find the maximum number of items per iteration for every rgroup.  */
1594   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1595     {
1596       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1597       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1598     }
1599
1600   /* Work out how many bits we need to represent the length limit.  */
1601   unsigned int min_ni_prec
1602     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1603
1604   /* Now use the maximum of below precisions for one suitable IV type:
1605      - the IV's natural precision
1606      - the precision needed to hold: the maximum number of scalar
1607        iterations multiplied by the scale factor (min_ni_prec above)
1608      - the Pmode precision
1609
1610      If min_ni_prec is less than the precision of the current niters,
1611      we perfer to still use the niters type.  Prefer to use Pmode and
1612      wider IV to avoid narrow conversions.  */
1613
1614   unsigned int ni_prec
1615     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1616   min_ni_prec = MAX (min_ni_prec, ni_prec);
1617   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1618
1619   tree iv_type = NULL_TREE;
1620   opt_scalar_int_mode tmode_iter;
1621   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1622     {
1623       scalar_mode tmode = tmode_iter.require ();
1624       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1625
1626       /* ??? Do we really want to construct one IV whose precision exceeds
1627          BITS_PER_WORD?  */
1628       if (tbits > BITS_PER_WORD)
1629         break;
1630
1631       /* Find the first available standard integral type.  */
1632       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1633         {
1634           iv_type = build_nonstandard_integer_type (tbits, true);
1635           break;
1636         }
1637     }
1638
1639   if (!iv_type)
1640     {
1641       if (dump_enabled_p ())
1642         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643                          "can't vectorize with length-based partial vectors"
1644                          " because there is no suitable iv type.\n");
1645       return false;
1646     }
1647
1648   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1649   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1650   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1651
1652   return true;
1653 }
1654
1655 /* Calculate the cost of one scalar iteration of the loop.  */
1656 static void
1657 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1658 {
1659   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1660   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1661   int nbbs = loop->num_nodes, factor;
1662   int innerloop_iters, i;
1663
1664   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1665
1666   /* Gather costs for statements in the scalar loop.  */
1667
1668   /* FORNOW.  */
1669   innerloop_iters = 1;
1670   if (loop->inner)
1671     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1672
1673   for (i = 0; i < nbbs; i++)
1674     {
1675       gimple_stmt_iterator si;
1676       basic_block bb = bbs[i];
1677
1678       if (bb->loop_father == loop->inner)
1679         factor = innerloop_iters;
1680       else
1681         factor = 1;
1682
1683       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1684         {
1685           gimple *stmt = gsi_stmt (si);
1686           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1687
1688           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1689             continue;
1690
1691           /* Skip stmts that are not vectorized inside the loop.  */
1692           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1693           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1694               && (!STMT_VINFO_LIVE_P (vstmt_info)
1695                   || !VECTORIZABLE_CYCLE_DEF
1696                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1697             continue;
1698
1699           vect_cost_for_stmt kind;
1700           if (STMT_VINFO_DATA_REF (stmt_info))
1701             {
1702               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1703                kind = scalar_load;
1704              else
1705                kind = scalar_store;
1706             }
1707           else if (vect_nop_conversion_p (stmt_info))
1708             continue;
1709           else
1710             kind = scalar_stmt;
1711
1712           /* We are using vect_prologue here to avoid scaling twice
1713              by the inner loop factor.  */
1714           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1715                             factor, kind, stmt_info, 0, vect_prologue);
1716         }
1717     }
1718
1719   /* Now accumulate cost.  */
1720   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1721   add_stmt_costs (loop_vinfo->scalar_costs,
1722                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1723   loop_vinfo->scalar_costs->finish_cost (nullptr);
1724 }
1725
1726 /* Function vect_analyze_loop_form.
1727
1728    Verify that certain CFG restrictions hold, including:
1729    - the loop has a pre-header
1730    - the loop has a single entry
1731    - nested loops can have only a single exit.
1732    - the loop exit condition is simple enough
1733    - the number of iterations can be analyzed, i.e, a countable loop.  The
1734      niter could be analyzed under some assumptions.  */
1735
1736 opt_result
1737 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1738 {
1739   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1740
1741   edge exit_e = vec_init_loop_exit_info (loop);
1742   if (!exit_e)
1743     return opt_result::failure_at (vect_location,
1744                                    "not vectorized:"
1745                                    " could not determine main exit from"
1746                                    " loop with multiple exits.\n");
1747   info->loop_exit = exit_e;
1748   if (dump_enabled_p ())
1749       dump_printf_loc (MSG_NOTE, vect_location,
1750                        "using as main loop exit: %d -> %d [AUX: %p]\n",
1751                        exit_e->src->index, exit_e->dest->index, exit_e->aux);
1752
1753   /* Check if we have any control flow that doesn't leave the loop.  */
1754   class loop *v_loop = loop->inner ? loop->inner : loop;
1755   basic_block *bbs = get_loop_body (v_loop);
1756   for (unsigned i = 0; i < v_loop->num_nodes; i++)
1757     if (EDGE_COUNT (bbs[i]->succs) != 1
1758         && (EDGE_COUNT (bbs[i]->succs) != 2
1759             || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1760       {
1761         free (bbs);
1762         return opt_result::failure_at (vect_location,
1763                                        "not vectorized:"
1764                                        " unsupported control flow in loop.\n");
1765       }
1766   free (bbs);
1767
1768   /* Different restrictions apply when we are considering an inner-most loop,
1769      vs. an outer (nested) loop.
1770      (FORNOW. May want to relax some of these restrictions in the future).  */
1771
1772   info->inner_loop_cond = NULL;
1773   if (!loop->inner)
1774     {
1775       /* Inner-most loop.  */
1776
1777       if (empty_block_p (loop->header))
1778         return opt_result::failure_at (vect_location,
1779                                        "not vectorized: empty loop.\n");
1780     }
1781   else
1782     {
1783       class loop *innerloop = loop->inner;
1784       edge entryedge;
1785
1786       /* Nested loop. We currently require that the loop is doubly-nested,
1787          contains a single inner loop with a single exit to the block
1788          with the single exit condition in the outer loop.
1789          Vectorizable outer-loops look like this:
1790
1791                         (pre-header)
1792                            |
1793                           header <---+
1794                            |         |
1795                           inner-loop |
1796                            |         |
1797                           tail ------+
1798                            |
1799                         (exit-bb)
1800
1801          The inner-loop also has the properties expected of inner-most loops
1802          as described above.  */
1803
1804       if ((loop->inner)->inner || (loop->inner)->next)
1805         return opt_result::failure_at (vect_location,
1806                                        "not vectorized:"
1807                                        " multiple nested loops.\n");
1808
1809       entryedge = loop_preheader_edge (innerloop);
1810       if (entryedge->src != loop->header
1811           || !single_exit (innerloop)
1812           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1813         return opt_result::failure_at (vect_location,
1814                                        "not vectorized:"
1815                                        " unsupported outerloop form.\n");
1816
1817       /* Analyze the inner-loop.  */
1818       vect_loop_form_info inner;
1819       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1820       if (!res)
1821         {
1822           if (dump_enabled_p ())
1823             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824                              "not vectorized: Bad inner loop.\n");
1825           return res;
1826         }
1827
1828       /* Don't support analyzing niter under assumptions for inner
1829          loop.  */
1830       if (!integer_onep (inner.assumptions))
1831         return opt_result::failure_at (vect_location,
1832                                        "not vectorized: Bad inner loop.\n");
1833
1834       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1835         return opt_result::failure_at (vect_location,
1836                                        "not vectorized: inner-loop count not"
1837                                        " invariant.\n");
1838
1839       if (dump_enabled_p ())
1840         dump_printf_loc (MSG_NOTE, vect_location,
1841                          "Considering outer-loop vectorization.\n");
1842       info->inner_loop_cond = inner.conds[0];
1843     }
1844
1845   if (EDGE_COUNT (loop->header->preds) != 2)
1846     return opt_result::failure_at (vect_location,
1847                                    "not vectorized:"
1848                                    " too many incoming edges.\n");
1849
1850   /* We assume that the latch is empty.  */
1851   if (!empty_block_p (loop->latch)
1852       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1853     return opt_result::failure_at (vect_location,
1854                                    "not vectorized: latch block not empty.\n");
1855
1856   /* Make sure there is no abnormal exit.  */
1857   auto_vec<edge> exits = get_loop_exit_edges (loop);
1858   for (edge e : exits)
1859     {
1860       if (e->flags & EDGE_ABNORMAL)
1861         return opt_result::failure_at (vect_location,
1862                                        "not vectorized:"
1863                                        " abnormal loop exit edge.\n");
1864     }
1865
1866   info->conds
1867     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1868                             &info->number_of_iterations,
1869                             &info->number_of_iterationsm1);
1870   if (info->conds.is_empty ())
1871     return opt_result::failure_at
1872       (vect_location,
1873        "not vectorized: complicated exit condition.\n");
1874
1875   /* Determine what the primary and alternate exit conds are.  */
1876   for (unsigned i = 0; i < info->conds.length (); i++)
1877     {
1878       gcond *cond = info->conds[i];
1879       if (exit_e->src == gimple_bb (cond))
1880         std::swap (info->conds[0], info->conds[i]);
1881     }
1882
1883   if (integer_zerop (info->assumptions)
1884       || !info->number_of_iterations
1885       || chrec_contains_undetermined (info->number_of_iterations))
1886     return opt_result::failure_at
1887       (info->conds[0],
1888        "not vectorized: number of iterations cannot be computed.\n");
1889
1890   if (integer_zerop (info->number_of_iterations))
1891     return opt_result::failure_at
1892       (info->conds[0],
1893        "not vectorized: number of iterations = 0.\n");
1894
1895   if (!(tree_fits_shwi_p (info->number_of_iterations)
1896         && tree_to_shwi (info->number_of_iterations) > 0))
1897     {
1898       if (dump_enabled_p ())
1899         {
1900           dump_printf_loc (MSG_NOTE, vect_location,
1901                            "Symbolic number of iterations is ");
1902           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1903           dump_printf (MSG_NOTE, "\n");
1904         }
1905     }
1906
1907   return opt_result::success ();
1908 }
1909
1910 /* Create a loop_vec_info for LOOP with SHARED and the
1911    vect_analyze_loop_form result.  */
1912
1913 loop_vec_info
1914 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1915                         const vect_loop_form_info *info,
1916                         loop_vec_info main_loop_info)
1917 {
1918   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1919   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1920   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1921   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1922   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1923   /* Also record the assumptions for versioning.  */
1924   if (!integer_onep (info->assumptions) && !main_loop_info)
1925     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1926
1927   for (gcond *cond : info->conds)
1928     {
1929       stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1930       STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1931       /* Mark the statement as a condition.  */
1932       STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1933     }
1934
1935   for (unsigned i = 1; i < info->conds.length (); i ++)
1936     LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1937   LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1938
1939   LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1940
1941   /* Check to see if we're vectorizing multiple exits.  */
1942   LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1943     = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1944
1945   if (info->inner_loop_cond)
1946     {
1947       stmt_vec_info inner_loop_cond_info
1948         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1949       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1950       /* If we have an estimate on the number of iterations of the inner
1951          loop use that to limit the scale for costing, otherwise use
1952          --param vect-inner-loop-cost-factor literally.  */
1953       widest_int nit;
1954       if (estimated_stmt_executions (loop->inner, &nit))
1955         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1956           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1957     }
1958
1959   return loop_vinfo;
1960 }
1961
1962
1963
1964 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1965    statements update the vectorization factor.  */
1966
1967 static void
1968 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1969 {
1970   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1972   int nbbs = loop->num_nodes;
1973   poly_uint64 vectorization_factor;
1974   int i;
1975
1976   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1977
1978   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979   gcc_assert (known_ne (vectorization_factor, 0U));
1980
1981   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1982      vectorization factor of the loop is the unrolling factor required by
1983      the SLP instances.  If that unrolling factor is 1, we say, that we
1984      perform pure SLP on loop - cross iteration parallelism is not
1985      exploited.  */
1986   bool only_slp_in_loop = true;
1987   for (i = 0; i < nbbs; i++)
1988     {
1989       basic_block bb = bbs[i];
1990       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1991            gsi_next (&si))
1992         {
1993           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1994           if (!stmt_info)
1995             continue;
1996           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1997                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1998               && !PURE_SLP_STMT (stmt_info))
1999             /* STMT needs both SLP and loop-based vectorization.  */
2000             only_slp_in_loop = false;
2001         }
2002       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2003            gsi_next (&si))
2004         {
2005           if (is_gimple_debug (gsi_stmt (si)))
2006             continue;
2007           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2008           stmt_info = vect_stmt_to_vectorize (stmt_info);
2009           if ((STMT_VINFO_RELEVANT_P (stmt_info)
2010                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2011               && !PURE_SLP_STMT (stmt_info))
2012             /* STMT needs both SLP and loop-based vectorization.  */
2013             only_slp_in_loop = false;
2014         }
2015     }
2016
2017   if (only_slp_in_loop)
2018     {
2019       if (dump_enabled_p ())
2020         dump_printf_loc (MSG_NOTE, vect_location,
2021                          "Loop contains only SLP stmts\n");
2022       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2023     }
2024   else
2025     {
2026       if (dump_enabled_p ())
2027         dump_printf_loc (MSG_NOTE, vect_location,
2028                          "Loop contains SLP and non-SLP stmts\n");
2029       /* Both the vectorization factor and unroll factor have the form
2030          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2031          so they must have a common multiple.  */
2032       vectorization_factor
2033         = force_common_multiple (vectorization_factor,
2034                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2035     }
2036
2037   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2038   if (dump_enabled_p ())
2039     {
2040       dump_printf_loc (MSG_NOTE, vect_location,
2041                        "Updating vectorization factor to ");
2042       dump_dec (MSG_NOTE, vectorization_factor);
2043       dump_printf (MSG_NOTE, ".\n");
2044     }
2045 }
2046
2047 /* Return true if STMT_INFO describes a double reduction phi and if
2048    the other phi in the reduction is also relevant for vectorization.
2049    This rejects cases such as:
2050
2051       outer1:
2052         x_1 = PHI <x_3(outer2), ...>;
2053         ...
2054
2055       inner:
2056         x_2 = ...;
2057         ...
2058
2059       outer2:
2060         x_3 = PHI <x_2(inner)>;
2061
2062    if nothing in x_2 or elsewhere makes x_1 relevant.  */
2063
2064 static bool
2065 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2066 {
2067   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2068     return false;
2069
2070   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2071 }
2072
2073 /* Function vect_analyze_loop_operations.
2074
2075    Scan the loop stmts and make sure they are all vectorizable.  */
2076
2077 static opt_result
2078 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2079 {
2080   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2081   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2082   int nbbs = loop->num_nodes;
2083   int i;
2084   stmt_vec_info stmt_info;
2085   bool need_to_vectorize = false;
2086   bool ok;
2087
2088   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2089
2090   auto_vec<stmt_info_for_cost> cost_vec;
2091
2092   for (i = 0; i < nbbs; i++)
2093     {
2094       basic_block bb = bbs[i];
2095
2096       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2097            gsi_next (&si))
2098         {
2099           gphi *phi = si.phi ();
2100           ok = true;
2101
2102           stmt_info = loop_vinfo->lookup_stmt (phi);
2103           if (dump_enabled_p ())
2104             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2105                              (gimple *) phi);
2106           if (virtual_operand_p (gimple_phi_result (phi)))
2107             continue;
2108
2109           /* Inner-loop loop-closed exit phi in outer-loop vectorization
2110              (i.e., a phi in the tail of the outer-loop).  */
2111           if (! is_loop_header_bb_p (bb))
2112             {
2113               /* FORNOW: we currently don't support the case that these phis
2114                  are not used in the outerloop (unless it is double reduction,
2115                  i.e., this phi is vect_reduction_def), cause this case
2116                  requires to actually do something here.  */
2117               if (STMT_VINFO_LIVE_P (stmt_info)
2118                   && !vect_active_double_reduction_p (stmt_info))
2119                 return opt_result::failure_at (phi,
2120                                                "Unsupported loop-closed phi"
2121                                                " in outer-loop.\n");
2122
2123               /* If PHI is used in the outer loop, we check that its operand
2124                  is defined in the inner loop.  */
2125               if (STMT_VINFO_RELEVANT_P (stmt_info))
2126                 {
2127                   tree phi_op;
2128
2129                   if (gimple_phi_num_args (phi) != 1)
2130                     return opt_result::failure_at (phi, "unsupported phi");
2131
2132                   phi_op = PHI_ARG_DEF (phi, 0);
2133                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2134                   if (!op_def_info)
2135                     return opt_result::failure_at (phi, "unsupported phi\n");
2136
2137                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2138                       && (STMT_VINFO_RELEVANT (op_def_info)
2139                           != vect_used_in_outer_by_reduction))
2140                     return opt_result::failure_at (phi, "unsupported phi\n");
2141
2142                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2143                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2144                            == vect_double_reduction_def))
2145                       && !vectorizable_lc_phi (loop_vinfo,
2146                                                stmt_info, NULL, NULL))
2147                     return opt_result::failure_at (phi, "unsupported phi\n");
2148                 }
2149
2150               continue;
2151             }
2152
2153           gcc_assert (stmt_info);
2154
2155           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2156                || STMT_VINFO_LIVE_P (stmt_info))
2157               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2158               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2159             /* A scalar-dependence cycle that we don't support.  */
2160             return opt_result::failure_at (phi,
2161                                            "not vectorized:"
2162                                            " scalar dependence cycle.\n");
2163
2164           if (STMT_VINFO_RELEVANT_P (stmt_info))
2165             {
2166               need_to_vectorize = true;
2167               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2168                   && ! PURE_SLP_STMT (stmt_info))
2169                 ok = vectorizable_induction (loop_vinfo,
2170                                              stmt_info, NULL, NULL,
2171                                              &cost_vec);
2172               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2173                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2174                             == vect_double_reduction_def)
2175                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2176                        && ! PURE_SLP_STMT (stmt_info))
2177                 ok = vectorizable_reduction (loop_vinfo,
2178                                              stmt_info, NULL, NULL, &cost_vec);
2179               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2180                         == vect_first_order_recurrence)
2181                        && ! PURE_SLP_STMT (stmt_info))
2182                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2183                                            &cost_vec);
2184             }
2185
2186           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2187           if (ok
2188               && STMT_VINFO_LIVE_P (stmt_info)
2189               && !PURE_SLP_STMT (stmt_info))
2190             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2191                                               -1, false, &cost_vec);
2192
2193           if (!ok)
2194             return opt_result::failure_at (phi,
2195                                            "not vectorized: relevant phi not "
2196                                            "supported: %G",
2197                                            static_cast <gimple *> (phi));
2198         }
2199
2200       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2201            gsi_next (&si))
2202         {
2203           gimple *stmt = gsi_stmt (si);
2204           if (!gimple_clobber_p (stmt)
2205               && !is_gimple_debug (stmt))
2206             {
2207               opt_result res
2208                 = vect_analyze_stmt (loop_vinfo,
2209                                      loop_vinfo->lookup_stmt (stmt),
2210                                      &need_to_vectorize,
2211                                      NULL, NULL, &cost_vec);
2212               if (!res)
2213                 return res;
2214             }
2215         }
2216     } /* bbs */
2217
2218   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2219
2220   /* All operations in the loop are either irrelevant (deal with loop
2221      control, or dead), or only used outside the loop and can be moved
2222      out of the loop (e.g. invariants, inductions).  The loop can be
2223      optimized away by scalar optimizations.  We're better off not
2224      touching this loop.  */
2225   if (!need_to_vectorize)
2226     {
2227       if (dump_enabled_p ())
2228         dump_printf_loc (MSG_NOTE, vect_location,
2229                          "All the computation can be taken out of the loop.\n");
2230       return opt_result::failure_at
2231         (vect_location,
2232          "not vectorized: redundant loop. no profit to vectorize.\n");
2233     }
2234
2235   return opt_result::success ();
2236 }
2237
2238 /* Return true if we know that the iteration count is smaller than the
2239    vectorization factor.  Return false if it isn't, or if we can't be sure
2240    either way.  */
2241
2242 static bool
2243 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2244 {
2245   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2246
2247   HOST_WIDE_INT max_niter;
2248   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2249     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2250   else
2251     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2252
2253   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2254     return true;
2255
2256   return false;
2257 }
2258
2259 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2260    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2261    definitely no, or -1 if it's worth retrying.  */
2262
2263 static int
2264 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2265                            unsigned *suggested_unroll_factor)
2266 {
2267   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2268   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2269
2270   /* Only loops that can handle partially-populated vectors can have iteration
2271      counts less than the vectorization factor.  */
2272   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2273       && vect_known_niters_smaller_than_vf (loop_vinfo))
2274     {
2275       if (dump_enabled_p ())
2276         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277                          "not vectorized: iteration count smaller than "
2278                          "vectorization factor.\n");
2279       return 0;
2280     }
2281
2282   /* If we know the number of iterations we can do better, for the
2283      epilogue we can also decide whether the main loop leaves us
2284      with enough iterations, prefering a smaller vector epilog then
2285      also possibly used for the case we skip the vector loop.  */
2286   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2287     {
2288       widest_int scalar_niters
2289         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2290       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2291         {
2292           loop_vec_info orig_loop_vinfo
2293             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2294           unsigned lowest_vf
2295             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2296           int prolog_peeling = 0;
2297           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2298             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2299           if (prolog_peeling >= 0
2300               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2301                            lowest_vf))
2302             {
2303               unsigned gap
2304                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2305               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2306                                % lowest_vf + gap);
2307             }
2308         }
2309       /* Reject vectorizing for a single scalar iteration, even if
2310          we could in principle implement that using partial vectors.  */
2311       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2312       if (scalar_niters <= peeling_gap + 1)
2313         {
2314           if (dump_enabled_p ())
2315             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316                              "not vectorized: loop only has a single "
2317                              "scalar iteration.\n");
2318           return 0;
2319         }
2320
2321       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2322         {
2323           /* Check that the loop processes at least one full vector.  */
2324           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2325           if (known_lt (scalar_niters, vf))
2326             {
2327               if (dump_enabled_p ())
2328                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329                                  "loop does not have enough iterations "
2330                                  "to support vectorization.\n");
2331               return 0;
2332             }
2333
2334           /* If we need to peel an extra epilogue iteration to handle data
2335              accesses with gaps, check that there are enough scalar iterations
2336              available.
2337
2338              The check above is redundant with this one when peeling for gaps,
2339              but the distinction is useful for diagnostics.  */
2340           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2341               && known_le (scalar_niters, vf))
2342             {
2343               if (dump_enabled_p ())
2344                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345                                  "loop does not have enough iterations "
2346                                  "to support peeling for gaps.\n");
2347               return 0;
2348             }
2349         }
2350     }
2351
  /* If using the "very cheap" model, reject cases in which we'd keep
2353      a copy of the scalar code (even if we might be able to vectorize it).  */
2354   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2355       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2356           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2358     {
2359       if (dump_enabled_p ())
2360         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361                          "some scalar iterations would need to be peeled\n");
2362       return 0;
2363     }
2364
2365   int min_profitable_iters, min_profitable_estimate;
2366   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2367                                       &min_profitable_estimate,
2368                                       suggested_unroll_factor);
2369
2370   if (min_profitable_iters < 0)
2371     {
2372       if (dump_enabled_p ())
2373         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374                          "not vectorized: vectorization not profitable.\n");
2375       if (dump_enabled_p ())
2376         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377                          "not vectorized: vector version will never be "
2378                          "profitable.\n");
2379       return -1;
2380     }
2381
2382   int min_scalar_loop_bound = (param_min_vect_loop_bound
2383                                * assumed_vf);
2384
2385   /* Use the cost model only if it is more conservative than user specified
2386      threshold.  */
2387   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2388                                     min_profitable_iters);
2389
2390   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2391
2392   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2393       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2394     {
2395       if (dump_enabled_p ())
2396         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397                          "not vectorized: vectorization not profitable.\n");
2398       if (dump_enabled_p ())
2399         dump_printf_loc (MSG_NOTE, vect_location,
2400                          "not vectorized: iteration count smaller than user "
2401                          "specified loop bound parameter or minimum profitable "
2402                          "iterations (whichever is more conservative).\n");
2403       return 0;
2404     }
2405
  /* The static profitability threshold min_profitable_estimate includes
2407      the cost of having to check at runtime whether the scalar loop
2408      should be used instead.  If it turns out that we don't need or want
2409      such a check, the threshold we should use for the static estimate
2410      is simply the point at which the vector loop becomes more profitable
2411      than the scalar loop.  */
2412   if (min_profitable_estimate > min_profitable_iters
2413       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2414       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2415       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2416       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2417     {
2418       if (dump_enabled_p ())
2419         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2420                          " choice between the scalar and vector loops\n");
2421       min_profitable_estimate = min_profitable_iters;
2422     }
2423
2424   /* If the vector loop needs multiple iterations to be beneficial then
2425      things are probably too close to call, and the conservative thing
2426      would be to stick with the scalar code.  */
2427   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2428       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2429     {
2430       if (dump_enabled_p ())
2431         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432                          "one iteration of the vector loop would be"
2433                          " more expensive than the equivalent number of"
2434                          " iterations of the scalar loop\n");
2435       return 0;
2436     }
2437
2438   HOST_WIDE_INT estimated_niter;
2439
2440   /* If we are vectorizing an epilogue then we know the maximum number of
2441      scalar iterations it will cover is at least one lower than the
2442      vectorization factor of the main loop.  */
2443   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2444     estimated_niter
2445       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2446   else
2447     {
2448       estimated_niter = estimated_stmt_executions_int (loop);
2449       if (estimated_niter == -1)
2450         estimated_niter = likely_max_stmt_executions_int (loop);
2451     }
2452   if (estimated_niter != -1
2453       && ((unsigned HOST_WIDE_INT) estimated_niter
2454           < MAX (th, (unsigned) min_profitable_estimate)))
2455     {
2456       if (dump_enabled_p ())
2457         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458                          "not vectorized: estimated iteration count too "
2459                          "small.\n");
2460       if (dump_enabled_p ())
2461         dump_printf_loc (MSG_NOTE, vect_location,
2462                          "not vectorized: estimated iteration count smaller "
2463                          "than specified loop bound parameter or minimum "
2464                          "profitable iterations (whichever is more "
2465                          "conservative).\n");
2466       return -1;
2467     }
2468
2469   return 1;
2470 }
2471
2472 static opt_result
2473 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2474                            vec<data_reference_p> *datarefs,
2475                            unsigned int *n_stmts)
2476 {
2477   *n_stmts = 0;
2478   for (unsigned i = 0; i < loop->num_nodes; i++)
2479     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2480          !gsi_end_p (gsi); gsi_next (&gsi))
2481       {
2482         gimple *stmt = gsi_stmt (gsi);
2483         if (is_gimple_debug (stmt))
2484           continue;
2485         ++(*n_stmts);
2486         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2487                                                         NULL, 0);
2488         if (!res)
2489           {
2490             if (is_gimple_call (stmt) && loop->safelen)
2491               {
2492                 tree fndecl = gimple_call_fndecl (stmt), op;
2493                 if (fndecl == NULL_TREE
2494                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2495                   {
2496                     fndecl = gimple_call_arg (stmt, 0);
2497                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2498                     fndecl = TREE_OPERAND (fndecl, 0);
2499                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2500                   }
2501                 if (fndecl != NULL_TREE)
2502                   {
2503                     cgraph_node *node = cgraph_node::get (fndecl);
2504                     if (node != NULL && node->simd_clones != NULL)
2505                       {
2506                         unsigned int j, n = gimple_call_num_args (stmt);
2507                         for (j = 0; j < n; j++)
2508                           {
2509                             op = gimple_call_arg (stmt, j);
2510                             if (DECL_P (op)
2511                                 || (REFERENCE_CLASS_P (op)
2512                                     && get_base_address (op)))
2513                               break;
2514                           }
2515                         op = gimple_call_lhs (stmt);
2516                         /* Ignore #pragma omp declare simd functions
2517                            if they don't have data references in the
2518                            call stmt itself.  */
2519                         if (j == n
2520                             && !(op
2521                                  && (DECL_P (op)
2522                                      || (REFERENCE_CLASS_P (op)
2523                                          && get_base_address (op)))))
2524                           continue;
2525                       }
2526                   }
2527               }
2528             return res;
2529           }
2530         /* If dependence analysis will give up due to the limit on the
2531            number of datarefs stop here and fail fatally.  */
2532         if (datarefs->length ()
2533             > (unsigned)param_loop_max_datarefs_for_datadeps)
2534           return opt_result::failure_at (stmt, "exceeded param "
2535                                          "loop-max-datarefs-for-datadeps\n");
2536       }
2537   return opt_result::success ();
2538 }
2539
2540 /* Look for SLP-only access groups and turn each individual access into its own
2541    group.  */
2542 static void
2543 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2544 {
2545   unsigned int i;
2546   struct data_reference *dr;
2547
2548   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2549
2550   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2551   FOR_EACH_VEC_ELT (datarefs, i, dr)
2552     {
2553       gcc_assert (DR_REF (dr));
2554       stmt_vec_info stmt_info
2555         = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));
2556
2557       /* Check if the load is a part of an interleaving chain.  */
2558       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2559         {
2560           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2561           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2562           unsigned int group_size = DR_GROUP_SIZE (first_element);
2563
2564           /* Check if SLP-only groups.  */
2565           if (!STMT_SLP_TYPE (stmt_info)
2566               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2567             {
2568               /* Dissolve the group.  */
2569               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2570
2571               stmt_vec_info vinfo = first_element;
2572               while (vinfo)
2573                 {
2574                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2575                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2576                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2577                   DR_GROUP_SIZE (vinfo) = 1;
2578                   if (STMT_VINFO_STRIDED_P (first_element)
2579                       /* We cannot handle stores with gaps.  */
2580                       || DR_IS_WRITE (dr_info->dr))
2581                     {
2582                       STMT_VINFO_STRIDED_P (vinfo) = true;
2583                       DR_GROUP_GAP (vinfo) = 0;
2584                     }
2585                   else
2586                     DR_GROUP_GAP (vinfo) = group_size - 1;
2587                   /* Duplicate and adjust alignment info, it needs to
2588                      be present on each group leader, see dr_misalignment.  */
2589                   if (vinfo != first_element)
2590                     {
2591                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2592                       dr_info2->target_alignment = dr_info->target_alignment;
2593                       int misalignment = dr_info->misalignment;
2594                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2595                         {
2596                           HOST_WIDE_INT diff
2597                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2598                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2599                           unsigned HOST_WIDE_INT align_c
2600                             = dr_info->target_alignment.to_constant ();
2601                           misalignment = (misalignment + diff) % align_c;
2602                         }
2603                       dr_info2->misalignment = misalignment;
2604                     }
2605                   vinfo = next;
2606                 }
2607             }
2608         }
2609     }
2610 }
2611
2612 /* Determine if operating on full vectors for LOOP_VINFO might leave
2613    some scalar iterations still to do.  If so, decide how we should
2614    handle those scalar iterations.  The possibilities are:
2615
2616    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2617        In this case:
2618
2619          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2620          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2621          LOOP_VINFO_PEELING_FOR_NITER == false
2622
2623    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2624        to handle the remaining scalar iterations.  In this case:
2625
2626          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2627          LOOP_VINFO_PEELING_FOR_NITER == true
2628
2629        There are two choices:
2630
2631        (2a) Consider vectorizing the epilogue loop at the same VF as the
2632             main loop, but using partial vectors instead of full vectors.
2633             In this case:
2634
2635               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2636
2637        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2638             In this case:
2639
2640               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2641  */
2642
2643 opt_result
2644 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2645 {
2646   /* Determine whether there would be any scalar iterations left over.  */
2647   bool need_peeling_or_partial_vectors_p
2648     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2649
2650   /* Decide whether to vectorize the loop with partial vectors.  */
2651   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2652   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2653   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2654       && need_peeling_or_partial_vectors_p)
2655     {
2656       /* For partial-vector-usage=1, try to push the handling of partial
2657          vectors to the epilogue, with the main loop continuing to operate
2658          on full vectors.
2659
2660          If we are unrolling we also do not want to use partial vectors. This
2661          is to avoid the overhead of generating multiple masks and also to
2662          avoid having to execute entire iterations of FALSE masked instructions
2663          when dealing with one or less full iterations.
2664
2665          ??? We could then end up failing to use partial vectors if we
2666          decide to peel iterations into a prologue, and if the main loop
2667          then ends up processing fewer than VF iterations.  */
2668       if ((param_vect_partial_vector_usage == 1
2669            || loop_vinfo->suggested_unroll_factor > 1)
2670           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2671           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2672         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2673       else
2674         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2675     }
2676
2677   if (dump_enabled_p ())
2678     dump_printf_loc (MSG_NOTE, vect_location,
2679                      "operating on %s vectors%s.\n",
2680                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2681                      ? "partial" : "full",
2682                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2683                      ? " for epilogue loop" : "");
2684
2685   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2686     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2687        && need_peeling_or_partial_vectors_p);
2688
2689   /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2690      analysis that we don't know whether the loop is vectorized by partial
2691      vectors (More details see tree-vect-loop-manip.cc).
2692
2693      However, SELECT_VL vectorizaton style should only applied on partial
2694      vectorization since SELECT_VL is the GIMPLE IR that calculates the
2695      number of elements to be process for each iteration.
2696
2697      After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2698      if it is not partial vectorized loop.  */
2699   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2700     LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2701
2702   return opt_result::success ();
2703 }
2704
2705 /* Function vect_analyze_loop_2.
2706
2707    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2708    analyses will record information in some members of LOOP_VINFO.  FATAL
2709    indicates if some analysis meets fatal error.  If one non-NULL pointer
2710    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2711    worked out suggested unroll factor, while one NULL pointer shows it's
2712    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2713    is to hold the slp decision when the suggested unroll factor is worked
2714    out.  */
2715 static opt_result
2716 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2717                      unsigned *suggested_unroll_factor,
2718                      bool& slp_done_for_suggested_uf)
2719 {
2720   opt_result ok = opt_result::success ();
2721   int res;
2722   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2723   poly_uint64 min_vf = 2;
2724   loop_vec_info orig_loop_vinfo = NULL;
2725
2726   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2727      loop_vec_info of the first vectorized loop.  */
2728   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2729     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2730   else
2731     orig_loop_vinfo = loop_vinfo;
2732   gcc_assert (orig_loop_vinfo);
2733
2734   /* The first group of checks is independent of the vector size.  */
2735   fatal = true;
2736
2737   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2738       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2739     return opt_result::failure_at (vect_location,
2740                                    "not vectorized: simd if(0)\n");
2741
2742   /* Find all data references in the loop (which correspond to vdefs/vuses)
2743      and analyze their evolution in the loop.  */
2744
2745   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2746
2747   /* Gather the data references and count stmts in the loop.  */
2748   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2749     {
2750       opt_result res
2751         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2752                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2753                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2754       if (!res)
2755         {
2756           if (dump_enabled_p ())
2757             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2758                              "not vectorized: loop contains function "
2759                              "calls or data references that cannot "
2760                              "be analyzed\n");
2761           return res;
2762         }
2763       loop_vinfo->shared->save_datarefs ();
2764     }
2765   else
2766     loop_vinfo->shared->check_datarefs ();
2767
2768   /* Analyze the data references and also adjust the minimal
2769      vectorization factor according to the loads and stores.  */
2770
2771   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2772   if (!ok)
2773     {
2774       if (dump_enabled_p ())
2775         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776                          "bad data references.\n");
2777       return ok;
2778     }
2779
2780   /* Check if we are applying unroll factor now.  */
2781   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2782   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2783
2784   /* If the slp decision is false when suggested unroll factor is worked
2785      out, and we are applying suggested unroll factor, we can simply skip
2786      all slp related analyses this time.  */
2787   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2788
2789   /* Classify all cross-iteration scalar data-flow cycles.
2790      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2791   vect_analyze_scalar_cycles (loop_vinfo, slp);
2792
2793   vect_pattern_recog (loop_vinfo);
2794
2795   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2796
2797   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2798      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2799
2800   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2801   if (!ok)
2802     {
2803       if (dump_enabled_p ())
2804         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2805                          "bad data access.\n");
2806       return ok;
2807     }
2808
2809   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2810
2811   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2812   if (!ok)
2813     {
2814       if (dump_enabled_p ())
2815         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2816                          "unexpected pattern.\n");
2817       return ok;
2818     }
2819
2820   /* While the rest of the analysis below depends on it in some way.  */
2821   fatal = false;
2822
2823   /* Analyze data dependences between the data-refs in the loop
2824      and adjust the maximum vectorization factor according to
2825      the dependences.
2826      FORNOW: fail at the first data dependence that we encounter.  */
2827
2828   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2829   if (!ok)
2830     {
2831       if (dump_enabled_p ())
2832         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2833                          "bad data dependence.\n");
2834       return ok;
2835     }
2836   if (max_vf != MAX_VECTORIZATION_FACTOR
2837       && maybe_lt (max_vf, min_vf))
2838     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2839   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2840
2841   ok = vect_determine_vectorization_factor (loop_vinfo);
2842   if (!ok)
2843     {
2844       if (dump_enabled_p ())
2845         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2846                          "can't determine vectorization factor.\n");
2847       return ok;
2848     }
2849
2850   /* Compute the scalar iteration cost.  */
2851   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2852
2853   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2854
2855   if (slp)
2856     {
2857       /* Check the SLP opportunities in the loop, analyze and build
2858          SLP trees.  */
2859       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2860       if (!ok)
2861         return ok;
2862
2863       /* If there are any SLP instances mark them as pure_slp.  */
2864       slp = vect_make_slp_decision (loop_vinfo);
2865       if (slp)
2866         {
2867           /* Find stmts that need to be both vectorized and SLPed.  */
2868           vect_detect_hybrid_slp (loop_vinfo);
2869
2870           /* Update the vectorization factor based on the SLP decision.  */
2871           vect_update_vf_for_slp (loop_vinfo);
2872
2873           /* Optimize the SLP graph with the vectorization factor fixed.  */
2874           vect_optimize_slp (loop_vinfo);
2875
2876           /* Gather the loads reachable from the SLP graph entries.  */
2877           vect_gather_slp_loads (loop_vinfo);
2878         }
2879     }
2880
2881   bool saved_can_use_partial_vectors_p
2882     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2883
2884   /* We don't expect to have to roll back to anything other than an empty
2885      set of rgroups.  */
2886   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2887
2888   /* This is the point where we can re-start analysis with SLP forced off.  */
2889 start_over:
2890
2891   /* Apply the suggested unrolling factor, this was determined by the backend
     during finish_cost the first time we ran the analysis for this
2893      vector mode.  */
2894   if (applying_suggested_uf)
2895     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2896
2897   /* Now the vectorization factor is final.  */
2898   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2899   gcc_assert (known_ne (vectorization_factor, 0U));
2900
2901   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2902     {
2903       dump_printf_loc (MSG_NOTE, vect_location,
2904                        "vectorization_factor = ");
2905       dump_dec (MSG_NOTE, vectorization_factor);
2906       dump_printf (MSG_NOTE, ", niters = %wd\n",
2907                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2908     }
2909
2910   if (max_vf != MAX_VECTORIZATION_FACTOR
2911       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2912     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2913
2914   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2915
2916   /* Analyze the alignment of the data-refs in the loop.
2917      Fail if a data reference is found that cannot be vectorized.  */
2918
2919   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2920   if (!ok)
2921     {
2922       if (dump_enabled_p ())
2923         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2924                          "bad data alignment.\n");
2925       return ok;
2926     }
2927
2928   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2929      It is important to call pruning after vect_analyze_data_ref_accesses,
2930      since we use grouping information gathered by interleaving analysis.  */
2931   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2932   if (!ok)
2933     return ok;
2934
2935   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2936      vectorization, since we do not want to add extra peeling or
2937      add versioning for alignment.  */
2938   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2939     /* This pass will decide on using loop versioning and/or loop peeling in
2940        order to enhance the alignment of data references in the loop.  */
2941     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2942   if (!ok)
2943     return ok;
2944
2945   if (slp)
2946     {
2947       /* Analyze operations in the SLP instances.  Note this may
2948          remove unsupported SLP instances which makes the above
2949          SLP kind detection invalid.  */
2950       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2951       vect_slp_analyze_operations (loop_vinfo);
2952       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2953         {
2954           ok = opt_result::failure_at (vect_location,
2955                                        "unsupported SLP instances\n");
2956           goto again;
2957         }
2958
2959       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2960       slp_tree load_node, slp_root;
2961       unsigned i, x;
2962       slp_instance instance;
2963       bool can_use_lanes = true;
2964       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2965         {
2966           slp_root = SLP_INSTANCE_TREE (instance);
2967           int group_size = SLP_TREE_LANES (slp_root);
2968           tree vectype = SLP_TREE_VECTYPE (slp_root);
2969           bool loads_permuted = false;
2970           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2971             {
2972               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2973                 continue;
2974               unsigned j;
2975               stmt_vec_info load_info;
2976               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2977                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2978                   {
2979                     loads_permuted = true;
2980                     break;
2981                   }
2982             }
2983
2984           /* If the loads and stores can be handled with load/store-lane
2985              instructions record it and move on to the next instance.  */
2986           if (loads_permuted
2987               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2988               && vect_store_lanes_supported (vectype, group_size, false)
2989                    != IFN_LAST)
2990             {
2991               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2992                 if (STMT_VINFO_GROUPED_ACCESS
2993                       (SLP_TREE_REPRESENTATIVE (load_node)))
2994                   {
2995                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2996                         (SLP_TREE_REPRESENTATIVE (load_node));
2997                     /* Use SLP for strided accesses (or if we can't
2998                        load-lanes).  */
2999                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
3000                         || vect_load_lanes_supported
3001                              (STMT_VINFO_VECTYPE (stmt_vinfo),
3002                               DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3003                       break;
3004                   }
3005
3006               can_use_lanes
3007                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3008
3009               if (can_use_lanes && dump_enabled_p ())
3010                 dump_printf_loc (MSG_NOTE, vect_location,
3011                                  "SLP instance %p can use load/store-lanes\n",
3012                                  (void *) instance);
3013             }
3014           else
3015             {
3016               can_use_lanes = false;
3017               break;
3018             }
3019         }
3020
3021       /* If all SLP instances can use load/store-lanes abort SLP and try again
3022          with SLP disabled.  */
3023       if (can_use_lanes)
3024         {
3025           ok = opt_result::failure_at (vect_location,
3026                                        "Built SLP cancelled: can use "
3027                                        "load/store-lanes\n");
3028           if (dump_enabled_p ())
3029             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030                              "Built SLP cancelled: all SLP instances support "
3031                              "load/store-lanes\n");
3032           goto again;
3033         }
3034     }
3035
3036   /* Dissolve SLP-only groups.  */
3037   vect_dissolve_slp_only_groups (loop_vinfo);
3038
3039   /* Scan all the remaining operations in the loop that are not subject
3040      to SLP and make sure they are vectorizable.  */
3041   ok = vect_analyze_loop_operations (loop_vinfo);
3042   if (!ok)
3043     {
3044       if (dump_enabled_p ())
3045         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3046                          "bad operation or unsupported loop bound.\n");
3047       return ok;
3048     }
3049
3050   /* For now, we don't expect to mix both masking and length approaches for one
3051      loop, disable it if both are recorded.  */
3052   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3053       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3054       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3055     {
3056       if (dump_enabled_p ())
3057         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058                          "can't vectorize a loop with partial vectors"
3059                          " because we don't expect to mix different"
3060                          " approaches with partial vectors for the"
3061                          " same loop.\n");
3062       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3063     }
3064
3065   /* If we still have the option of using partial vectors,
3066      check whether we can generate the necessary loop controls.  */
3067   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3068     {
3069       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3070         {
3071           if (!vect_verify_full_masking (loop_vinfo)
3072               && !vect_verify_full_masking_avx512 (loop_vinfo))
3073             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3074         }
3075       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3076         if (!vect_verify_loop_lens (loop_vinfo))
3077           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3078     }
3079
3080   /* If we're vectorizing a loop that uses length "controls" and
3081      can iterate more than once, we apply decrementing IV approach
3082      in loop control.  */
3083   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3084       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3085       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3086       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3087            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3088                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3089     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3090
3091   /* If a loop uses length controls and has a decrementing loop control IV,
3092      we will normally pass that IV through a MIN_EXPR to calculate the
3093      basis for the length controls.  E.g. in a loop that processes one
3094      element per scalar iteration, the number of elements would be
3095      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3096
3097      This MIN_EXPR approach allows us to use pointer IVs with an invariant
3098      step, since only the final iteration of the vector loop can have
3099      inactive lanes.
3100
3101      However, some targets have a dedicated instruction for calculating the
3102      preferred length, given the total number of elements that still need to
3103      be processed.  This is encapsulated in the SELECT_VL internal function.
3104
3105      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3106      to determine the basis for the length controls.  However, unlike the
3107      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3108      lanes inactive in any iteration of the vector loop, not just the last
3109      iteration.  This SELECT_VL approach therefore requires us to use pointer
3110      IVs with variable steps.
3111
3112      Once we've decided how many elements should be processed by one
3113      iteration of the vector loop, we need to populate the rgroup controls.
3114      If a loop has multiple rgroups, we need to make sure that those rgroups
3115      "line up" (that is, they must be consistent about which elements are
3116      active and which aren't).  This is done by vect_adjust_loop_lens_control.
3117
3118      In principle, it would be possible to use vect_adjust_loop_lens_control
3119      on either the result of a MIN_EXPR or the result of a SELECT_VL.
3120      However:
3121
3122      (1) In practice, it only makes sense to use SELECT_VL when a vector
3123          operation will be controlled directly by the result.  It is not
3124          worth using SELECT_VL if it would only be the input to other
3125          calculations.
3126
3127      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3128          pointer IV will need N updates by a variable amount (N-1 updates
3129          within the iteration and 1 update to move to the next iteration).
3130
3131      Because of this, we prefer to use the MIN_EXPR approach whenever there
3132      is more than one length control.
3133
3134      In addition, SELECT_VL always operates to a granularity of 1 unit.
3135      If we wanted to use it to control an SLP operation on N consecutive
3136      elements, we would need to make the SELECT_VL inputs measure scalar
3137      iterations (rather than elements) and then multiply the SELECT_VL
3138      result by N.  But using SELECT_VL this way is inefficient because
3139      of (1) above.
3140
3141      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3142         satisfied:
3143
3144      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3145      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3146
3147      Since SELECT_VL (variable step) will make SCEV analysis fail and then
3148      we will fail to gain benefits of following unroll optimizations. We prefer
3149      using the MIN_EXPR approach in this situation.  */
3150   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3151     {
3152       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3153       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3154                                           OPTIMIZE_FOR_SPEED)
3155           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3156           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3157           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3158               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3159         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3160     }
3161
3162   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3163      assuming that the loop will be used as a main loop.  We will redo
3164      this analysis later if we instead decide to use the loop as an
3165      epilogue loop.  */
3166   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3167   if (!ok)
3168     return ok;
3169
3170   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3171      to be able to handle fewer than VF scalars, or needs to have a lower VF
3172      than the main loop.  */
3173   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3174       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175     {
3176       poly_uint64 unscaled_vf
3177         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3178                      orig_loop_vinfo->suggested_unroll_factor);
3179       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3180         return opt_result::failure_at (vect_location,
3181                                        "Vectorization factor too high for"
3182                                        " epilogue loop.\n");
3183     }
3184
3185   /* Check the costings of the loop make vectorizing worthwhile.  */
3186   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3187   if (res < 0)
3188     {
3189       ok = opt_result::failure_at (vect_location,
3190                                    "Loop costings may not be worthwhile.\n");
3191       goto again;
3192     }
3193   if (!res)
3194     return opt_result::failure_at (vect_location,
3195                                    "Loop costings not worthwhile.\n");
3196
3197   /* If an epilogue loop is required make sure we can create one.  */
3198   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3199       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3200       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3201     {
3202       if (dump_enabled_p ())
3203         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3204       if (!vect_can_advance_ivs_p (loop_vinfo)
3205           || !slpeel_can_duplicate_loop_p (loop,
3206                                            LOOP_VINFO_IV_EXIT (loop_vinfo),
3207                                            LOOP_VINFO_IV_EXIT (loop_vinfo)))
3208         {
3209           ok = opt_result::failure_at (vect_location,
3210                                        "not vectorized: can't create required "
3211                                        "epilog loop\n");
3212           goto again;
3213         }
3214     }
3215
3216   /* During peeling, we need to check if number of loop iterations is
3217      enough for both peeled prolog loop and vector loop.  This check
3218      can be merged along with threshold check of loop versioning, so
3219      increase threshold for this case if necessary.
3220
3221      If we are analyzing an epilogue we still want to check what its
3222      versioning threshold would be.  If we decide to vectorize the epilogues we
3223      will want to use the lowest versioning threshold of all epilogues and main
3224      loop.  This will enable us to enter a vectorized epilogue even when
3225      versioning the loop.  We can't simply check whether the epilogue requires
3226      versioning though since we may have skipped some versioning checks when
3227      analyzing the epilogue.  For instance, checks for alias versioning will be
3228      skipped when dealing with epilogues as we assume we already checked them
3229      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3230   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3231     {
3232       poly_uint64 niters_th = 0;
3233       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234
3235       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3236         {
3237           /* Niters for peeled prolog loop.  */
3238           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3239             {
3240               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3241               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3242               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3243             }
3244           else
3245             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3246         }
3247
3248       /* Niters for at least one iteration of vectorized loop.  */
3249       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3250         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3251       /* One additional iteration because of peeling for gap.  */
3252       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3253         niters_th += 1;
3254
3255       /*  Use the same condition as vect_transform_loop to decide when to use
3256           the cost to determine a versioning threshold.  */
3257       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3258           && ordered_p (th, niters_th))
3259         niters_th = ordered_max (poly_uint64 (th), niters_th);
3260
3261       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3262     }
3263
3264   gcc_assert (known_eq (vectorization_factor,
3265                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3266
3267   slp_done_for_suggested_uf = slp;
3268
3269   /* Ok to vectorize!  */
3270   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3271   return opt_result::success ();
3272
3273 again:
3274   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3275   gcc_assert (!ok);
3276
3277   /* Try again with SLP forced off but if we didn't do any SLP there is
3278      no point in re-trying.  */
3279   if (!slp)
3280     return ok;
3281
3282   /* If the slp decision is true when suggested unroll factor is worked
3283      out, and we are applying suggested unroll factor, we don't need to
3284      re-try any more.  */
3285   if (applying_suggested_uf && slp_done_for_suggested_uf)
3286     return ok;
3287
3288   /* If there are reduction chains re-trying will fail anyway.  */
3289   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3290     return ok;
3291
3292   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3293      via interleaving or lane instructions.  */
3294   slp_instance instance;
3295   slp_tree node;
3296   unsigned i, j;
3297   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3298     {
3299       stmt_vec_info vinfo;
3300       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3301       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3302         continue;
3303       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3304       unsigned int size = DR_GROUP_SIZE (vinfo);
3305       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3306       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3307          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3308          && ! vect_grouped_store_supported (vectype, size))
3309         return opt_result::failure_at (vinfo->stmt,
3310                                        "unsupported grouped store\n");
3311       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3312         {
3313           vinfo = SLP_TREE_REPRESENTATIVE (node);
3314           if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3315             {
3316               vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3317               bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3318               size = DR_GROUP_SIZE (vinfo);
3319               vectype = STMT_VINFO_VECTYPE (vinfo);
3320               if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3321                   && ! vect_grouped_load_supported (vectype, single_element_p,
3322                                                     size))
3323                 return opt_result::failure_at (vinfo->stmt,
3324                                                "unsupported grouped load\n");
3325             }
3326         }
3327     }
3328
3329   if (dump_enabled_p ())
3330     dump_printf_loc (MSG_NOTE, vect_location,
3331                      "re-trying with SLP disabled\n");
3332
3333   /* Roll back state appropriately.  No SLP this time.  */
3334   slp = false;
3335   /* Restore vectorization factor as it were without SLP.  */
3336   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3337   /* Free the SLP instances.  */
3338   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3339     vect_free_slp_instance (instance);
3340   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3341   /* Reset SLP type to loop_vect on all stmts.  */
3342   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3343     {
3344       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3345       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3346            !gsi_end_p (si); gsi_next (&si))
3347         {
3348           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3349           STMT_SLP_TYPE (stmt_info) = loop_vect;
3350           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3351               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3352             {
3353               /* vectorizable_reduction adjusts reduction stmt def-types,
3354                  restore them to that of the PHI.  */
3355               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3356                 = STMT_VINFO_DEF_TYPE (stmt_info);
3357               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3358                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3359                 = STMT_VINFO_DEF_TYPE (stmt_info);
3360             }
3361         }
3362       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3363            !gsi_end_p (si); gsi_next (&si))
3364         {
3365           if (is_gimple_debug (gsi_stmt (si)))
3366             continue;
3367           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3368           STMT_SLP_TYPE (stmt_info) = loop_vect;
3369           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3370             {
3371               stmt_vec_info pattern_stmt_info
3372                 = STMT_VINFO_RELATED_STMT (stmt_info);
3373               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3374                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3375
3376               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3377               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3378               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3379                    !gsi_end_p (pi); gsi_next (&pi))
3380                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3381                   = loop_vect;
3382             }
3383         }
3384     }
3385   /* Free optimized alias test DDRS.  */
3386   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3387   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3388   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3389   /* Reset target cost data.  */
3390   delete loop_vinfo->vector_costs;
3391   loop_vinfo->vector_costs = nullptr;
3392   /* Reset accumulated rgroup information.  */
3393   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3394   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3395   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3396   /* Reset assorted flags.  */
3397   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3398   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3399   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3400   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3401   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3402     = saved_can_use_partial_vectors_p;
3403
3404   goto start_over;
3405 }
3406
3407 /* Return true if vectorizing the loop with NEW_LOOP_VINFO looks like a
3408    better choice than vectorizing it with OLD_LOOP_VINFO.  The incumbent
3409    OLD_LOOP_VINFO wins unless something specifically favors the newcomer.
3410
3411    Note that this relation is deliberately not a partial order.  */
3412
3413 static bool
3414 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3415                           loop_vec_info old_loop_vinfo)
3416 {
3417   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3418   gcc_assert (loop == LOOP_VINFO_LOOP (old_loop_vinfo));
3419
3420   /* A VF equal to loop->simdlen always beats any other VF, so when
3421      exactly one of the candidates has it that candidate wins.  */
3422   if (loop->simdlen)
3423     {
3424       poly_int64 vf_new = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3425       poly_int64 vf_old = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3426       bool new_hits_simdlen = known_eq (vf_new, loop->simdlen);
3427       if (new_hits_simdlen != known_eq (vf_old, loop->simdlen))
3428         return new_hits_simdlen;
3429     }
3430
3431   /* Otherwise defer to the cost comparison, using the epilogue variant
3432      when OLD_LOOP_VINFO records an original (main) loop.  */
3433   const auto *cost_new = new_loop_vinfo->vector_costs;
3434   const auto *cost_old = old_loop_vinfo->vector_costs;
3435   loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
3436   return (main_loop
3437           ? cost_new->better_epilogue_loop_than_p (cost_old, main_loop)
3438           : cost_new->better_main_loop_than_p (cost_old));
3439 }
3440
3441 /* Return true if NEW_LOOP_VINFO should replace OLD_LOOP_VINFO, noting the
3442    preference in the dump output when dumping is enabled.  */
3443
3444 static bool
3445 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3446                         loop_vec_info old_loop_vinfo)
3447 {
3448   bool better_p = vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo);
3449
3450   /* Only a change of preference is worth reporting.  */
3451   if (better_p && dump_enabled_p ())
3452     dump_printf_loc (MSG_NOTE, vect_location,
3453                      "***** Preferring vector mode %s to vector mode %s\n",
3454                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3455                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3456   return better_p;
3457 }
3458
3459 /* Analyze LOOP using VECTOR_MODES[MODE_I], treating it as an epilogue of
3460    MAIN_LOOP_VINFO when that is not NULL.  If the requested mode is
3461    VOIDmode, store the vector mode of the resulting loop_vinfo in
3462    AUTODETECTED_VECTOR_MODE.  Advance MODE_I to the next mode useful to
3463    analyze.  Return the loop_vinfo on success, a wrapped null on failure.  */
3464
3465 static opt_loop_vec_info
3466 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3467                      const vect_loop_form_info *loop_form_info,
3468                      loop_vec_info main_loop_vinfo,
3469                      const vector_modes &vector_modes, unsigned &mode_i,
3470                      machine_mode &autodetected_vector_mode,
3471                      bool &fatal)
3472 {
3473   loop_vec_info loop_vinfo
3474     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3475   const unsigned int n_modes = vector_modes.length ();
3476   machine_mode requested_mode = vector_modes[mode_i];
3477   unsigned int unroll_factor = 1;
3478   bool slp_done_p = false;
3479   loop_vinfo->vector_mode = requested_mode;
3480
3481   /* Run the main analysis.  */
3482   opt_result res
3483     = vect_analyze_loop_2 (loop_vinfo, fatal, &unroll_factor, slp_done_p);
3484   if (dump_enabled_p ())
3485     dump_printf_loc (MSG_NOTE, vect_location,
3486                      "***** Analysis %s with vector mode %s\n",
3487                      res ? "succeeded" : " failed",
3488                      GET_MODE_NAME (loop_vinfo->vector_mode));
3489
3490   /* If a successful main-loop analysis left an unroll factor above 1, analyze
3491      again with that factor applied and prefer the new result if it works.  */
3492   if (res && !main_loop_vinfo && unroll_factor > 1)
3493     {
3494       if (dump_enabled_p ())
3495         dump_printf_loc (MSG_NOTE, vect_location,
3496                          "***** Re-trying analysis for unrolling"
3497                          " with unroll factor %d and slp %s.\n",
3498                          unroll_factor, slp_done_p ? "on" : "off");
3499       loop_vec_info unrolled_vinfo
3500         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3501       unrolled_vinfo->vector_mode = requested_mode;
3502       unrolled_vinfo->suggested_unroll_factor = unroll_factor;
3503       opt_result unrolled_res
3504         = vect_analyze_loop_2 (unrolled_vinfo, fatal, NULL, slp_done_p);
3505       if (!unrolled_res)
3506         delete unrolled_vinfo;
3507       else
3508         {
3509           delete loop_vinfo;
3510           loop_vinfo = unrolled_vinfo;
3511         }
3512     }
3513
3514   /* Remember the autodetected vector mode.  */
3515   if (requested_mode == VOIDmode)
3516     autodetected_vector_mode = loop_vinfo->vector_mode;
3517
3518   /* Advance MODE_I, first stepping over every following mode for which the
3519      analysis result would be the same.  */
3520   for (; (mode_i + 1 < n_modes
3521           && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i + 1]));
3522        ++mode_i)
3523     if (dump_enabled_p ())
3524       dump_printf_loc (MSG_NOTE, vect_location,
3525                        "***** The result for vector mode %s would"
3526                        " be the same\n",
3527                        GET_MODE_NAME (vector_modes[mode_i + 1]));
3528
3529   /* Also step over a following mode that maps to and from the autodetected
3530      mode, since analyzing it would repeat the autodetected analysis.  */
3531   if (mode_i + 1 < n_modes && VECTOR_MODE_P (autodetected_vector_mode))
3532     {
3533       machine_mode next_mode = vector_modes[mode_i + 1];
3534       if ((related_vector_mode (next_mode,
3535                                 GET_MODE_INNER (autodetected_vector_mode))
3536            == autodetected_vector_mode)
3537           && (related_vector_mode (autodetected_vector_mode,
3538                                    GET_MODE_INNER (next_mode))
3539               == next_mode))
3540         {
3541           if (dump_enabled_p ())
3542             dump_printf_loc (MSG_NOTE, vect_location,
3543                              "***** Skipping vector mode %s, which would"
3544                              " repeat the analysis for %s\n",
3545                              GET_MODE_NAME (next_mode),
3546                              GET_MODE_NAME (autodetected_vector_mode));
3547           mode_i += 1;
3548         }
3549     }
3550   mode_i++;
3551
3552   if (res)
3553     return opt_loop_vec_info::success (loop_vinfo);
3554
3555   delete loop_vinfo;
3556   gcc_checking_assert (!fatal || main_loop_vinfo == NULL);
3557   return opt_loop_vec_info::propagate_failure (res);
3558 }
3559
3560 /* Function vect_analyze_loop.
3561
3562    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3563    for it.  The different analyses will record information in the
3564    loop_vec_info struct.  */
3565 opt_loop_vec_info
3566 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3567 {
3568   DUMP_VECT_SCOPE ("analyze_loop_nest");
3569
3570   if (loop_outer (loop)
3571       && loop_vec_info_for_loop (loop_outer (loop))
3572       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3573     return opt_loop_vec_info::failure_at (vect_location,
3574                                           "outer-loop already vectorized.\n");
3575
3576   if (!find_loop_nest (loop, &shared->loop_nest))
3577     return opt_loop_vec_info::failure_at
3578       (vect_location,
3579        "not vectorized: loop nest containing two or more consecutive inner"
3580        " loops cannot be vectorized\n");
3581
3582   /* Analyze the loop form.  */
3583   vect_loop_form_info loop_form_info;
3584   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3585   if (!res)
3586     {
3587       if (dump_enabled_p ())
3588         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589                          "bad loop form.\n");
3590       return opt_loop_vec_info::propagate_failure (res);
3591     }
3592   if (!integer_onep (loop_form_info.assumptions))
3593     {
3594       /* We consider to vectorize this loop by versioning it under
3595          some assumptions.  In order to do this, we need to clear
3596          existing information computed by scev and niter analyzer.  */
3597       scev_reset_htab ();
3598       free_numbers_of_iterations_estimates (loop);
3599       /* Also set flag for this loop so that following scev and niter
3600          analysis are done under the assumptions.  */
3601       loop_constraint_set (loop, LOOP_C_FINITE);
3602     }
3603   else
3604     /* Clear the existing niter information to make sure the nonwrapping flag
3605        will be calculated and set appropriately.  */
3606     free_numbers_of_iterations_estimates (loop);
3607
3608   auto_vector_modes vector_modes;
3609   /* Autodetect first vector size we try.  */
3610   vector_modes.safe_push (VOIDmode);
3611   unsigned int autovec_flags
3612     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3613                                                     loop->simdlen != 0);
3614   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3615                              && !unlimited_cost_model (loop));
3616   machine_mode autodetected_vector_mode = VOIDmode;
3617   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3618   unsigned int mode_i = 0;
3619   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3620
3621   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3622      a mode has not been analyzed.  */
3623   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3624   for (unsigned i = 0; i < vector_modes.length (); ++i)
3625     cached_vf_per_mode.safe_push (0);
3626
3627   /* First determine the main loop vectorization mode, either the first
3628      one that works, starting with auto-detecting the vector mode and then
3629      following the targets order of preference, or the one with the
3630      lowest cost if pick_lowest_cost_p.  */
3631   while (1)
3632     {
3633       bool fatal;
3634       unsigned int last_mode_i = mode_i;
3635       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3636          failed.  */
3637       cached_vf_per_mode[last_mode_i] = -1;
3638       opt_loop_vec_info loop_vinfo
3639         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3640                                NULL, vector_modes, mode_i,
3641                                autodetected_vector_mode, fatal);
3642       if (fatal)
3643         break;
3644
3645       if (loop_vinfo)
3646         {
3647           /*  Analysis has been successful so update the VF value.  The
3648               VF should always be a multiple of unroll_factor and we want to
3649               capture the original VF here.  */
3650           cached_vf_per_mode[last_mode_i]
3651             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3652                          loop_vinfo->suggested_unroll_factor);
3653           /* Once we hit the desired simdlen for the first time,
3654              discard any previous attempts.  */
3655           if (simdlen
3656               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3657             {
3658               delete first_loop_vinfo;
3659               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3660               simdlen = 0;
3661             }
3662           else if (pick_lowest_cost_p
3663                    && first_loop_vinfo
3664                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3665             {
3666               /* Pick loop_vinfo over first_loop_vinfo.  */
3667               delete first_loop_vinfo;
3668               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3669             }
3670           if (first_loop_vinfo == NULL)
3671             first_loop_vinfo = loop_vinfo;
3672           else
3673             {
3674               delete loop_vinfo;
3675               loop_vinfo = opt_loop_vec_info::success (NULL);
3676             }
3677
3678           /* Commit to first_loop_vinfo if we have no reason to try
3679              alternatives.  */
3680           if (!simdlen && !pick_lowest_cost_p)
3681             break;
3682         }
3683       if (mode_i == vector_modes.length ()
3684           || autodetected_vector_mode == VOIDmode)
3685         break;
3686
3687       /* Try the next biggest vector size.  */
3688       if (dump_enabled_p ())
3689         dump_printf_loc (MSG_NOTE, vect_location,
3690                          "***** Re-trying analysis with vector mode %s\n",
3691                          GET_MODE_NAME (vector_modes[mode_i]));
3692     }
3693   if (!first_loop_vinfo)
3694     return opt_loop_vec_info::propagate_failure (res);
3695
3696   if (dump_enabled_p ())
3697     dump_printf_loc (MSG_NOTE, vect_location,
3698                      "***** Choosing vector mode %s\n",
3699                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3700
3701   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3702      enabled, SIMDUID is not set, it is the innermost loop and we have
3703      either already found the loop's SIMDLEN or there was no SIMDLEN to
3704      begin with.
3705      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3706   bool vect_epilogues = (!simdlen
3707                          && loop->inner == NULL
3708                          && param_vect_epilogues_nomask
3709                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3710                            /* No code motion support for multiple epilogues so for now
3711                               not supported when multiple exits.  */
3712                          && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3713                          && !loop->simduid);
3714   if (!vect_epilogues)
3715     return first_loop_vinfo;
3716
3717   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3718   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3719
3720   /* For epilogues start the analysis from the first mode.  The motivation
3721      behind starting from the beginning comes from cases where the VECTOR_MODES
3722      array may contain length-agnostic and length-specific modes.  Their
3723      ordering is not guaranteed, so we could end up picking a mode for the main
3724      loop that is after the epilogue's optimal mode.  */
3725   vector_modes[0] = autodetected_vector_mode;
3726   mode_i = 0;
3727
3728   bool supports_partial_vectors =
3729     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3730   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3731
3732   while (1)
3733     {
3734       /* If the target does not support partial vectors we can shorten the
3735          number of modes to analyze for the epilogue as we know we can't pick a
3736          mode that would lead to a VF at least as big as the
3737          FIRST_VINFO_VF.  */
3738       if (!supports_partial_vectors
3739           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3740         {
3741           mode_i++;
3742           if (mode_i == vector_modes.length ())
3743             break;
3744           continue;
3745         }
3746
3747       if (dump_enabled_p ())
3748         dump_printf_loc (MSG_NOTE, vect_location,
3749                          "***** Re-trying epilogue analysis with vector "
3750                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3751
3752       bool fatal;
3753       opt_loop_vec_info loop_vinfo
3754         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3755                                first_loop_vinfo,
3756                                vector_modes, mode_i,
3757                                autodetected_vector_mode, fatal);
3758       if (fatal)
3759         break;
3760
3761       if (loop_vinfo)
3762         {
3763           if (pick_lowest_cost_p)
3764             {
3765               /* Keep trying to roll back vectorization attempts while the
3766                  loop_vec_infos they produced were worse than this one.  */
3767               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3768               while (!vinfos.is_empty ()
3769                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3770                 {
3771                   gcc_assert (vect_epilogues);
3772                   delete vinfos.pop ();
3773                 }
3774             }
3775           /* For now only allow one epilogue loop.  */
3776           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3777             {
3778               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3779               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3780               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3781                           || maybe_ne (lowest_th, 0U));
3782               /* Keep track of the known smallest versioning
3783                  threshold.  */
3784               if (ordered_p (lowest_th, th))
3785                 lowest_th = ordered_min (lowest_th, th);
3786             }
3787           else
3788             {
3789               delete loop_vinfo;
3790               loop_vinfo = opt_loop_vec_info::success (NULL);
3791             }
3792
3793           /* For now only allow one epilogue loop, but allow
3794              pick_lowest_cost_p to replace it, so commit to the
3795              first epilogue if we have no reason to try alternatives.  */
3796           if (!pick_lowest_cost_p)
3797             break;
3798         }
3799
3800       if (mode_i == vector_modes.length ())
3801         break;
3802
3803     }
3804
3805   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3806     {
3807       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3808       if (dump_enabled_p ())
3809         dump_printf_loc (MSG_NOTE, vect_location,
3810                          "***** Choosing epilogue vector mode %s\n",
3811                          GET_MODE_NAME
3812                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3813     }
3814
3815   return first_loop_vinfo;
3816 }
3817
3818 /* Return true if there is an in-order reduction function for CODE, storing
3819    it in *REDUC_FN if so.  */
3820
3821 static bool
3822 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3823 {
3824   /* We support MINUS_EXPR by negating the operand.  This also preserves an
3825      initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3826      (-0.0) = -0.0.  */
3827   if (code == PLUS_EXPR || code == MINUS_EXPR)
3828     {
3829       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3830       return true;
3831     }
3832   return false;
3833 }
3834
3835 /* Function reduction_fn_for_scalar_code
3836
3837    Input:
3838    CODE - tree_code of a reduction operations.
3839
3840    Output:
3841    REDUC_FN - the corresponding internal function to be used to reduce the
3842       vector of partial results into a single scalar result, or IFN_LAST
3843       if the operation is a supported reduction operation, but does not have
3844       such an internal function.
3845
3846    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3847
3848 bool
3849 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3850 {
3851   if (code.is_tree_code ())
3852     switch (tree_code (code))
3853       {
3854       case MAX_EXPR:
3855         *reduc_fn = IFN_REDUC_MAX;
3856         return true;
3857
3858       case MIN_EXPR:
3859         *reduc_fn = IFN_REDUC_MIN;
3860         return true;
3861
3862       case PLUS_EXPR:
3863         *reduc_fn = IFN_REDUC_PLUS;
3864         return true;
3865
3866       case BIT_AND_EXPR:
3867         *reduc_fn = IFN_REDUC_AND;
3868         return true;
3869
3870       case BIT_IOR_EXPR:
3871         *reduc_fn = IFN_REDUC_IOR;
3872         return true;
3873
3874       case BIT_XOR_EXPR:
3875         *reduc_fn = IFN_REDUC_XOR;
3876         return true;
3877
3878       case MULT_EXPR:
3879       case MINUS_EXPR:
3880         *reduc_fn = IFN_LAST;
3881         return true;
3882
3883       default:
3884         return false;
3885       }
3886   else
3887     switch (combined_fn (code))
3888       {
3889       CASE_CFN_FMAX:
3890         *reduc_fn = IFN_REDUC_FMAX;
3891         return true;
3892
3893       CASE_CFN_FMIN:
3894         *reduc_fn = IFN_REDUC_FMIN;
3895         return true;
3896
3897       default:
3898         return false;
3899       }
3900 }
3901
3902 /* If there is a neutral value X such that a reduction would not be affected
3903    by the introduction of additional X elements, return that X, otherwise
3904    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3905    of the scalar elements.  If the reduction has just a single initial value
3906    then INITIAL_VALUE is that value, otherwise it is null.
3907    If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3908    In that case no signed zero is returned.  */
3909
3910 tree
3911 neutral_op_for_reduction (tree scalar_type, code_helper code,
3912                           tree initial_value, bool as_initial)
3913 {
3914   if (code.is_tree_code ())
3915     switch (tree_code (code))
3916       {
3917       case DOT_PROD_EXPR:
3918       case SAD_EXPR:
3919       case MINUS_EXPR:
3920       case BIT_IOR_EXPR:
3921       case BIT_XOR_EXPR:
3922         return build_zero_cst (scalar_type);
3923       case WIDEN_SUM_EXPR:
3924       case PLUS_EXPR:
3925         if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3926           return build_real (scalar_type, dconstm0);
3927         else
3928           return build_zero_cst (scalar_type);
3929
3930       case MULT_EXPR:
3931         return build_one_cst (scalar_type);
3932
3933       case BIT_AND_EXPR:
3934         return build_all_ones_cst (scalar_type);
3935
3936       case MAX_EXPR:
3937       case MIN_EXPR:
3938         return initial_value;
3939
3940       default:
3941         return NULL_TREE;
3942       }
3943   else
3944     switch (combined_fn (code))
3945       {
3946       CASE_CFN_FMIN:
3947       CASE_CFN_FMAX:
3948         return initial_value;
3949
3950       default:
3951         return NULL_TREE;
3952       }
3953 }
3954
3955 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3956    STMT is printed with a message MSG. */
3957
3958 static void
3959 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3960 {
3961   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3962 }
3963
3964 /* Return true if we need an in-order reduction for operation CODE
3965    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3966    overflow must wrap.  */
3967
3968 bool
3969 needs_fold_left_reduction_p (tree type, code_helper code)
3970 {
3971   /* CHECKME: check for !flag_finite_math_only too?  */
3972   if (SCALAR_FLOAT_TYPE_P (type))
3973     {
3974       if (code.is_tree_code ())
3975         switch (tree_code (code))
3976           {
3977           case MIN_EXPR:
3978           case MAX_EXPR:
3979             return false;
3980
3981           default:
3982             return !flag_associative_math;
3983           }
3984       else
3985         switch (combined_fn (code))
3986           {
3987           CASE_CFN_FMIN:
3988           CASE_CFN_FMAX:
3989             return false;
3990
3991           default:
3992             return !flag_associative_math;
3993           }
3994     }
3995
3996   if (INTEGRAL_TYPE_P (type))
3997     return (!code.is_tree_code ()
3998             || !operation_no_trapping_overflow (type, tree_code (code)));
3999
4000   if (SAT_FIXED_POINT_TYPE_P (type))
4001     return true;
4002
4003   return false;
4004 }
4005
/* Return true if the reduction PHI in LOOP, whose latch argument is
   LOOP_ARG, has a handled computation expression.  Store the main reduction
   operation in *CODE.

   The function first walks the SSA use-def graph depth-first, starting at
   the use of LOOP_ARG in PHI and following operand definitions that are
   inside LOOP, until it reaches a use of the PHI result.  The uses on the
   way are recorded in PATH, the use of LOOP_ARG in PHI first.  It then
   validates the statements on that path.  LOC is the location used for the
   dump of the path.

   False is returned if no valid path was found, if the reduction value is
   negated each iteration or if the path consists of nop conversions
   only.  */

static bool
check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
                      tree loop_arg, code_helper *code,
                      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
{
  auto_bitmap visited;
  tree lookfor = PHI_RESULT (phi);
  ssa_op_iter curri;
  use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
  /* Advance to the PHI argument that is LOOP_ARG.  */
  while (USE_FROM_PTR (curr) != loop_arg)
    curr = op_iter_next_use (&curri);
  /* NOTE(review): presumably this marks the PHI operand iterator as
     exhausted so that backtracking to this first path entry does not go on
     with the remaining PHI arguments -- confirm against ssa-iterators.h.  */
  curri.i = curri.numops;
  do
    {
      path.safe_push (std::make_pair (curri, curr));
      tree use = USE_FROM_PTR (curr);
      /* Reached the PHI result: the cycle is closed.  */
      if (use == lookfor)
        break;
      gimple *def = SSA_NAME_DEF_STMT (use);
      if (gimple_nop_p (def)
          || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
        {
          /* Dead end: backtrack.  Pop path entries and resume each popped
             iterator at its next eligible operand, until one is found or
             the path is empty.  This label is also the target of the jump
             taken below when a definition has no eligible operand.  */
pop:
          do
            {
              std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
              curri = x.first;
              curr = x.second;
              do
                curr = op_iter_next_use (&curri);
              /* Skip already visited or non-SSA operands (from iterating
                 over PHI args).  */
              while (curr != NULL_USE_OPERAND_P
                     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
                         || ! bitmap_set_bit (visited,
                                              SSA_NAME_VERSION
                                                (USE_FROM_PTR (curr)))));
            }
          while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
          /* Nothing left to explore; PATH is empty at this point.  */
          if (curr == NULL_USE_OPERAND_P)
            break;
        }
      else
        {
          /* Descend into the definition: start iterating its operands and
             pick the first SSA name that was not visited before.  */
          if (gimple_code (def) == GIMPLE_PHI)
            curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
          else
            curr = op_iter_init_use (&curri, def, SSA_OP_USE);
          while (curr != NULL_USE_OPERAND_P
                 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
                     || ! bitmap_set_bit (visited,
                                          SSA_NAME_VERSION
                                            (USE_FROM_PTR (curr)))))
            curr = op_iter_next_use (&curri);
          if (curr == NULL_USE_OPERAND_P)
            goto pop;
        }
    }
  while (1);
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
      unsigned i;
      std::pair<ssa_op_iter, use_operand_p> *x;
      FOR_EACH_VEC_ELT (path, i, x)
        dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
      dump_printf (MSG_NOTE, "\n");
    }

  /* Check whether the reduction path detected is valid.  */
  bool fail = path.length () == 0;
  /* Whether the reduction value is negated each iteration.  */
  bool neg = false;
  /* Signedness of the type of the first non-conversion operation, -1 while
     none has been seen.  */
  int sign = -1;
  *code = ERROR_MARK;
  /* Entry 0 is the use in the reduction PHI itself, so start at 1.  */
  for (unsigned i = 1; i < path.length (); ++i)
    {
      gimple *use_stmt = USE_STMT (path[i].second);
      gimple_match_op op;
      if (!gimple_extract_op (use_stmt, &op))
        {
          fail = true;
          break;
        }
      /* Find the index OPI of the operand of USE_STMT that is on the path;
         it stays at OP.NUM_OPS if the use is not one of the operands.  */
      unsigned int opi = op.num_ops;
      if (gassign *assign = dyn_cast<gassign *> (use_stmt))
        {
          /* The following make sure we can compute the operand index
             easily plus it mostly disallows chaining via COND_EXPR condition
             operands.  */
          for (opi = 0; opi < op.num_ops; ++opi)
            if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
              break;
        }
      else if (gcall *call = dyn_cast<gcall *> (use_stmt))
        {
          for (opi = 0; opi < op.num_ops; ++opi)
            if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
              break;
        }
      if (opi == op.num_ops)
        {
          fail = true;
          break;
        }
      op.code = canonicalize_code (op.code, op.type);
      /* Subtractions are treated as additions for the purpose of matching
         the operations on the path against each other.  */
      if (op.code == MINUS_EXPR)
        {
          op.code = PLUS_EXPR;
          /* Track whether we negate the reduction value each iteration.  */
          if (op.ops[1] == op.ops[opi])
            neg = ! neg;
        }
      else if (op.code == IFN_COND_SUB)
        {
          op.code = IFN_COND_ADD;
          /* Track whether we negate the reduction value each iteration.  */
          if (op.ops[2] == op.ops[opi])
            neg = ! neg;
        }
      /* Nop conversions are ignored.  The first other operation determines
         *CODE; all further ones have to match it, and for MIN_EXPR and
         MAX_EXPR the signedness has to match as well.  */
      if (CONVERT_EXPR_CODE_P (op.code)
          && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
        ;
      else if (*code == ERROR_MARK)
        {
          *code = op.code;
          sign = TYPE_SIGN (op.type);
        }
      else if (op.code != *code)
        {
          fail = true;
          break;
        }
      else if ((op.code == MIN_EXPR
                || op.code == MAX_EXPR)
               && sign != TYPE_SIGN (op.type))
        {
          fail = true;
          break;
        }
      /* Check there's only a single stmt the op is used on.  For the
         not value-changing tail and the last stmt allow out-of-loop uses.
         ???  We could relax this and handle arbitrary live stmts by
         forcing a scalar epilogue for example.  */
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *op_use_stmt;
      /* Number of counted uses of the path operand; it has to be 1.  */
      unsigned cnt = 0;
      bool cond_fn_p = op.code.is_internal_fn ()
        && (conditional_internal_fn_code (internal_fn (op.code))
            != ERROR_MARK);

      FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
        {
        /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
           op1 twice (once as definition, once as else) in the same operation.
           Allow this.  */
          if (cond_fn_p && op_use_stmt == use_stmt)
            {
              gcall *call = as_a<gcall *> (use_stmt);
              unsigned else_pos
                = internal_fn_else_index (internal_fn (op.code));

              /* Count the occurrences among the call arguments, leaving out
                 the else operand.  */
              for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
                {
                  if (j == else_pos)
                    continue;
                  if (gimple_call_arg (call, j) == op.ops[opi])
                    cnt++;
                }
            }
          /* Debug stmts never count.  While *CODE is still ERROR_MARK only
             uses inside LOOP are counted.  */
          else if (!is_gimple_debug (op_use_stmt)
                   && (*code != ERROR_MARK
                       || flow_bb_inside_loop_p (loop,
                                                 gimple_bb (op_use_stmt))))
            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
              cnt++;
        }

      if (cnt != 1)
        {
          fail = true;
          break;
        }
    }
  return ! fail && ! neg && *code != ERROR_MARK;
}
4196
4197 bool
4198 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4199                       tree loop_arg, enum tree_code code)
4200 {
4201   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4202   code_helper code_;
4203   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4204           && code_ == code);
4205 }
4206
4207
4208
/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

   a3 = ...
   loop_header:
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   (4) Detect condition expressions, ie:
     for (int i = 0; i < N; i++)
       if (a[i] < val)
        ret_val = a[i];

   LOOP_INFO is the loop being vectorized and PHI_INFO the PHI to analyze;
   the PHI may belong to a loop nested in the vectorized one.  On success
   the stmt_vec_info of the definition of the PHI's latch argument is
   returned, otherwise NULL.  *DOUBLE_REDUC is set to true if a double
   reduction was detected and *REDUC_CHAIN_P is set to true if a reduction
   chain was detected and recorded; both are false otherwise.  Reduction
   chains are only formed if SLP is true.  STMT_VINFO_REDUC_TYPE of PHI_INFO
   is initialized to TREE_CODE_REDUCTION, and when a reduction path is found
   STMT_VINFO_REDUC_CODE of PHI_INFO and STMT_VINFO_REDUC_IDX of the
   statements on the path are filled in.

*/

static stmt_vec_info
vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
                          bool *double_reduc, bool *reduc_chain_p, bool slp)
{
  gphi *phi = as_a <gphi *> (phi_info->stmt);
  /* The most recently seen non-debug statement using the PHI result.  */
  gimple *phi_use_stmt = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  *double_reduc = false;
  *reduc_chain_p = false;
  STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;

  tree phi_name = PHI_RESULT (phi);
  /* ???  If there are no uses of the PHI result the inner loop reduction
     won't be detected as possibly double-reduction by vectorizable_reduction
     because that tries to walk the PHI arg from the preheader edge which
     can be constant.  See PR60382.  */
  if (has_zero_uses (phi_name))
    return NULL;
  class loop *loop = (gimple_bb (phi))->loop_father;
  /* Count the statements in LOOP that use the PHI result and reject any use
     outside of LOOP.  */
  unsigned nphi_def_loop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;

      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "intermediate value used outside loop.\n");

          return NULL;
        }

      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
         op1 twice (once as definition, once as else) in the same operation.
         Only count it as one. */
      if (use_stmt != phi_use_stmt)
        {
          nphi_def_loop_uses++;
          phi_use_stmt = use_stmt;
        }
    }

  /* The value flowing back into the PHI over the latch edge has to be an
     SSA name defined by a statement inside LOOP.  */
  tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
  if (TREE_CODE (latch_def) != SSA_NAME)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "reduction: not ssa_name: %T\n", latch_def);
      return NULL;
    }

  stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
  if (!def_stmt_info
      || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
    return NULL;

  /* True if LOOP is nested inside the loop being vectorized.  */
  bool nested_in_vect_loop
    = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
  /* Count the uses of the latch definition inside LOOP and collect the
     PHIs using it outside of LOOP.  */
  unsigned nlatch_def_loop_uses = 0;
  auto_vec<gphi *, 3> lcphis;
  bool inner_loop_of_double_reduc = false;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
        nlatch_def_loop_uses++;
      else
        {
          /* We can have more than one loop-closed PHI.  */
          lcphis.safe_push (as_a <gphi *> (use_stmt));
          if (nested_in_vect_loop
              && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
                  == vect_double_reduction_def))
            inner_loop_of_double_reduc = true;
        }
    }

  /* If we are vectorizing an inner reduction we are executing that
     in the original order only in case we are not dealing with a
     double reduction.  */
  if (nested_in_vect_loop && !inner_loop_of_double_reduc)
    {
      if (dump_enabled_p ())
        report_vect_op (MSG_NOTE, def_stmt_info->stmt,
                        "detected nested cycle: ");
      return def_stmt_info;
    }

  /* When the inner loop of a double reduction ends up with more than
     one loop-closed PHI we have failed to classify alternate such
     PHIs as double reduction, leading to wrong code.  See PR103237.  */
  if (inner_loop_of_double_reduc && lcphis.length () != 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unhandle double reduction\n");
      return NULL;
    }

  /* If this isn't a nested cycle or if the nested cycle reduction value
     is used outside of the inner loop we cannot handle uses of the reduction
     value.  */
  if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "reduction used in loop.\n");
      return NULL;
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
    {
      tree op1 = PHI_ARG_DEF (def_stmt, 0);
      if (gimple_phi_num_args (def_stmt) != 1
          || TREE_CODE (op1) != SSA_NAME)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "unsupported phi node definition.\n");

          return NULL;
        }

      /* Verify there is an inner cycle composed of the PHI phi_use_stmt
         and the latch definition op1.  */
      gimple *def1 = SSA_NAME_DEF_STMT (op1);
      if (gimple_bb (def1)
          && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
          && loop->inner
          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
          && (is_gimple_assign (def1) || is_gimple_call (def1))
          && is_a <gphi *> (phi_use_stmt)
          && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
          && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
                                            loop_latch_edge (loop->inner))))
        {
          if (dump_enabled_p ())
            report_vect_op (MSG_NOTE, def_stmt,
                            "detected double reduction: ");

          *double_reduc = true;
          return def_stmt_info;
        }

      return NULL;
    }

  /* Look for the expression computing latch_def from the loop PHI result.  */
  auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
  code_helper code;
  if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
                            path))
    {
      STMT_VINFO_REDUC_CODE (phi_info) = code;
      if (code == COND_EXPR && !nested_in_vect_loop)
        STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;

      /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
         reduction chain for which the additional restriction is that
         all operations in the chain are the same.  */
      auto_vec<stmt_vec_info, 8> reduc_chain;
      unsigned i;
      bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
      /* Walk the path from its last entry down to entry 1; entry 0 is the
         use in the reduction PHI itself.  */
      for (i = path.length () - 1; i >= 1; --i)
        {
          gimple *stmt = USE_STMT (path[i].second);
          stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
          gimple_match_op op;
          if (!gimple_extract_op (stmt, &op))
            gcc_unreachable ();
          /* Record which operand of STMT is the one on the reduction
             path.  */
          if (gassign *assign = dyn_cast<gassign *> (stmt))
            STMT_VINFO_REDUC_IDX (stmt_info)
              = path[i].second->use - gimple_assign_rhs1_ptr (assign);
          else
            {
              gcall *call = as_a<gcall *> (stmt);
              STMT_VINFO_REDUC_IDX (stmt_info)
                = path[i].second->use - gimple_call_arg_ptr (call, 0);
            }
          bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
                                     && (i == 1 || i == path.length () - 1));
          if ((op.code != code && !leading_conversion)
              /* We can only handle the final value in epilogue
                 generation for reduction chains.  */
              || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
            is_slp_reduc = false;
          /* For reduction chains we support a trailing/leading
             conversions.  We do not store those in the actual chain.  */
          if (leading_conversion)
            continue;
          reduc_chain.safe_push (stmt_info);
        }
      if (slp && is_slp_reduc && reduc_chain.length () > 1)
        {
          /* Link the statements into a group headed by reduc_chain[0].  */
          for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
            {
              REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
              REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
            }
          REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
          REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;

          /* Save the chain for further analysis in SLP detection.  */
          LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
          REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();

          *reduc_chain_p = true;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                            "reduction: detected reduction chain\n");
        }
      else if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "reduction: detected reduction\n");

      return def_stmt_info;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "reduction: unknown pattern\n");

  return NULL;
}
4486
4487 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4488    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4489    or -1 if not known.  */
4490
4491 static int
4492 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4493 {
4494   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4495   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4496     {
4497       if (dump_enabled_p ())
4498         dump_printf_loc (MSG_NOTE, vect_location,
4499                          "cost model: epilogue peel iters set to vf/2 "
4500                          "because loop iterations are unknown .\n");
4501       return assumed_vf / 2;
4502     }
4503   else
4504     {
4505       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4506       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4507       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4508       /* If we need to peel for gaps, but no peeling is required, we have to
4509          peel VF iterations.  */
4510       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4511         peel_iters_epilogue = assumed_vf;
4512       return peel_iters_epilogue;
4513     }
4514 }
4515
4516 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times; the matching
4517    epilogue peel count is stored in *PEEL_ITERS_EPILOGUE.  */
4518 int
4519 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4520                              int *peel_iters_epilogue,
4521                              stmt_vector_for_cost *scalar_cost_vec,
4522                              stmt_vector_for_cost *prologue_cost_vec,
4523                              stmt_vector_for_cost *epilogue_cost_vec)
4524 {
4525   const int epilogue_iters
4526     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4527   *peel_iters_epilogue = epilogue_iters;
4528   int total = 0;
4529   /* With an unknown scalar trip count, charge one taken branch for each
4530      peeled loop that has iterations.  */
4531   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4532     {
4533       if (peel_iters_prologue > 0)
4534         total = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4535                                   vect_prologue);
4536       if (epilogue_iters > 0)
4537         total += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4538                                    vect_epilogue);
4539     }
4540
4541   /* Replay the scalar iteration costs once per peeled iteration.  */
4542   int ix;
4543   stmt_info_for_cost *cost;
4544   if (peel_iters_prologue != 0)
4545     FOR_EACH_VEC_ELT (*scalar_cost_vec, ix, cost)
4546       total += record_stmt_cost (prologue_cost_vec,
4547                                  cost->count * peel_iters_prologue,
4548                                  cost->kind, cost->stmt_info,
4549                                  cost->misalign, vect_prologue);
4550   if (epilogue_iters != 0)
4551     FOR_EACH_VEC_ELT (*scalar_cost_vec, ix, cost)
4552       total += record_stmt_cost (epilogue_cost_vec,
4553                                  cost->count * epilogue_iters,
4554                                  cost->kind, cost->stmt_info,
4555                                  cost->misalign, vect_epilogue);
4556   return total;
4557 }
4558
4559 /* Function vect_estimate_min_profitable_iters
4560
4561    Return the number of iterations required for the vector version of the
4562    loop to be profitable relative to the cost of the scalar version of the
4563    loop.  The results are passed back through the two RET_* pointers.
4564
4565    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4566    of iterations for vectorization.  -1 value means loop vectorization
4567    is not profitable.  This returned value may be used for dynamic
4568    profitability check.
4569
4570    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4571    for static check against estimated number of iterations.

        Both outputs are set to 0 when the cost model is disabled, and to -1
        when one vector iteration saves nothing over VF scalar iterations.

        SUGGESTED_UNROLL_FACTOR is passed on to finish_cost.  If it is nonnull
        and the value it then holds is greater than 1, that value is reset
        to 1 when the unrolled vectorization factor is not known to stay
        within a restricted LOOP_VINFO_MAX_VECT_FACTOR.  */
4572
4573 static void
4574 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4575                                     int *ret_min_profitable_niters,
4576                                     int *ret_min_profitable_estimate,
4577                                     unsigned *suggested_unroll_factor)
4578 {
4579   int min_profitable_iters;
4580   int min_profitable_estimate;
4581   int peel_iters_prologue;
4582   int peel_iters_epilogue;
4583   unsigned vec_inside_cost = 0;
4584   int vec_outside_cost = 0;
4585   unsigned vec_prologue_cost = 0;
4586   unsigned vec_epilogue_cost = 0;
4587   int scalar_single_iter_cost = 0;
4588   int scalar_outside_cost = 0;
4589   int assumed_vf = vect_vf_for_cost (loop_vinfo);
       /* A negative NPEEL is treated below as an unknown amount of peeling
          for alignment.  */
4590   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4591   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4592
4593   /* Cost model disabled.  */
4594   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4595     {
4596       if (dump_enabled_p ())
4597         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4598       *ret_min_profitable_niters = 0;
4599       *ret_min_profitable_estimate = 0;
4600       return;
4601     }
4602
4603   /* Requires loop versioning tests to handle misalignment.  */
4604   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4605     {
4606       /*  FIXME: Make cost depend on complexity of individual check.  */
4607       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4608       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4609       if (dump_enabled_p ())
4610         dump_printf (MSG_NOTE,
4611                      "cost model: Adding cost of checks for loop "
4612                      "versioning to treat misalignment.\n");
4613     }
4614
4615   /* Requires loop versioning with alias checks.  */
4616   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4617     {
4618       /*  FIXME: Make cost depend on complexity of individual check.  */
4619       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4620       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4621       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4622       if (len)
4623         /* Count LEN - 1 ANDs and LEN comparisons.  */
4624         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4625                               scalar_stmt, vect_prologue);
4626       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4627       if (len)
4628         {
4629           /* Count LEN - 1 ANDs and LEN comparisons.  */
4630           unsigned int nstmts = len * 2 - 1;
4631           /* +1 for each bias that needs adding.  */
4632           for (unsigned int i = 0; i < len; ++i)
4633             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4634               nstmts += 1;
4635           (void) add_stmt_cost (target_cost_data, nstmts,
4636                                 scalar_stmt, vect_prologue);
4637         }
4638       if (dump_enabled_p ())
4639         dump_printf (MSG_NOTE,
4640                      "cost model: Adding cost of checks for loop "
4641                      "versioning aliasing.\n");
4642     }
4643
4644   /* Requires loop versioning with niter checks.  */
4645   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4646     {
4647       /*  FIXME: Make cost depend on complexity of individual check.  */
4648       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4649                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4650       if (dump_enabled_p ())
4651         dump_printf (MSG_NOTE,
4652                      "cost model: Adding cost of checks for loop "
4653                      "versioning niters.\n");
4654     }
4655
       /* Any kind of versioning additionally costs one taken branch in the
          prologue.  */
4656   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4657     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4658                           vect_prologue);
4659
4660   /* Count statements in scalar loop.  Using this as scalar cost for a single
4661      iteration for now.
4662
4663      TODO: Add outer loop support.
4664
4665      TODO: Consider assigning different costs to different scalar
4666      statements.  */
4667
4668   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4669
4670   /* Add additional cost for the peeled instructions in prologue and epilogue
4671      loop.  (For fully-masked loops there will be no peeling.)
4672
4673      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4674      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4675
4676      TODO: Build an expression that represents peel_iters for prologue and
4677      epilogue to be used in a run-time test.  */
4678
4679   bool prologue_need_br_taken_cost = false;
4680   bool prologue_need_br_not_taken_cost = false;
4681
4682   /* Calculate peel_iters_prologue.  */
4683   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4684     peel_iters_prologue = 0;
4685   else if (npeel < 0)
4686     {
4687       peel_iters_prologue = assumed_vf / 2;
4688       if (dump_enabled_p ())
4689         dump_printf (MSG_NOTE, "cost model: "
4690                      "prologue peel iters set to vf/2.\n");
4691
4692       /* If peeled iterations are unknown, count a taken branch and a not taken
4693          branch per peeled loop.  Even if scalar loop iterations are known,
4694          vector iterations are not known since peeled prologue iterations are
4695          not known.  Hence guards remain the same.  */
4696       prologue_need_br_taken_cost = true;
4697       prologue_need_br_not_taken_cost = true;
4698     }
4699   else
4700     {
4701       peel_iters_prologue = npeel;
4702       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4703         /* If peeled iterations are known but number of scalar loop
4704            iterations are unknown, count a taken branch per peeled loop.  */
4705         prologue_need_br_taken_cost = true;
4706     }
4707
4708   bool epilogue_need_br_taken_cost = false;
4709   bool epilogue_need_br_not_taken_cost = false;
4710
4711   /* Calculate peel_iters_epilogue.  */
4712   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4713     /* We need to peel exactly one iteration for gaps.  */
4714     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4715   else if (npeel < 0)
4716     {
4717       /* If peeling for alignment is unknown, loop bound of main loop
4718          becomes unknown.  */
4719       peel_iters_epilogue = assumed_vf / 2;
4720       if (dump_enabled_p ())
4721         dump_printf (MSG_NOTE, "cost model: "
4722                      "epilogue peel iters set to vf/2 because "
4723                      "peeling for alignment is unknown.\n");
4724
4725       /* See the same reason above in peel_iters_prologue calculation.  */
4726       epilogue_need_br_taken_cost = true;
4727       epilogue_need_br_not_taken_cost = true;
4728     }
4729   else
4730     {
4731       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4732       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4733         /* If peeled iterations are known but number of scalar loop
4734            iterations are unknown, count a taken branch per peeled loop.  */
4735         epilogue_need_br_taken_cost = true;
4736     }
4737
4738   stmt_info_for_cost *si;
4739   int j;
4740   /* Add costs associated with peel_iters_prologue.  */
4741   if (peel_iters_prologue)
4742     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4743       {
4744         (void) add_stmt_cost (target_cost_data,
4745                               si->count * peel_iters_prologue, si->kind,
4746                               si->stmt_info, si->node, si->vectype,
4747                               si->misalign, vect_prologue);
4748       }
4749
4750   /* Add costs associated with peel_iters_epilogue.  */
4751   if (peel_iters_epilogue)
4752     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4753       {
4754         (void) add_stmt_cost (target_cost_data,
4755                               si->count * peel_iters_epilogue, si->kind,
4756                               si->stmt_info, si->node, si->vectype,
4757                               si->misalign, vect_epilogue);
4758       }
4759
4760   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4761
4762   if (prologue_need_br_taken_cost)
4763     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4764                           vect_prologue);
4765
4766   if (prologue_need_br_not_taken_cost)
4767     (void) add_stmt_cost (target_cost_data, 1,
4768                           cond_branch_not_taken, vect_prologue);
4769
4770   if (epilogue_need_br_taken_cost)
4771     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4772                           vect_epilogue);
4773
4774   if (epilogue_need_br_not_taken_cost)
4775     (void) add_stmt_cost (target_cost_data, 1,
4776                           cond_branch_not_taken, vect_epilogue);
4777
4778   /* Take care of special costs for rgroup controls of partial vectors.  */
4779   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4780       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4781           == vect_partial_vectors_avx512))
4782     {
4783       /* Calculate how many masks we need to generate.  */
4784       unsigned int num_masks = 0;
4785       bool need_saturation = false;
4786       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4787         if (rgm.type)
4788           {
4789             unsigned nvectors = rgm.factor;
4790             num_masks += nvectors;
4791             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4792                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4793               need_saturation = true;
4794           }
4795
4796       /* ???  The target isn't able to identify the costs below as
4797          producing masks so it cannot penalize cases where we'd run
4798          out of mask registers for example.  */
4799
4800       /* ???  We are also failing to account for smaller vector masks
4801          we generate by splitting larger masks in vect_get_loop_mask.  */
4802
4803       /* In the worst case, we need to generate each mask in the prologue
4804          and in the loop body.  We need one splat per group and one
4805          compare per mask.
4806
4807          Sometimes the prologue mask will fold to a constant,
4808          so the actual prologue cost might be smaller.  However, it's
4809          simpler and safer to use the worst-case cost; if this ends up
4810          being the tie-breaker between vectorizing or not, then it's
4811          probably better not to vectorize.  */
4812       (void) add_stmt_cost (target_cost_data,
4813                             num_masks
4814                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4815                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4816                             vect_prologue);
4817       (void) add_stmt_cost (target_cost_data,
4818                             num_masks
4819                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4820                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4821
4822       /* When we need saturation we need it both in the prologue and
4823          the epilogue.  */
           /* NOTE(review): the two costs below are recorded against
              vect_prologue and vect_body, not vect_epilogue.  */
4824       if (need_saturation)
4825         {
4826           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4827                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4828           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4829                                 NULL, NULL, NULL_TREE, 0, vect_body);
4830         }
4831     }
4832   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4833            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4834                == vect_partial_vectors_while_ult))
4835     {
4836       /* Calculate how many masks we need to generate.  */
4837       unsigned int num_masks = 0;
4838       rgroup_controls *rgm;
4839       unsigned int num_vectors_m1;
4840       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4841                         num_vectors_m1, rgm)
4842         if (rgm->type)
4843           num_masks += num_vectors_m1 + 1;
4844       gcc_assert (num_masks > 0);
4845
4846       /* In the worst case, we need to generate each mask in the prologue
4847          and in the loop body.  One of the loop body mask instructions
4848          replaces the comparison in the scalar loop, and since we don't
4849          count the scalar comparison against the scalar body, we shouldn't
4850          count that vector instruction against the vector body either.
4851
4852          Sometimes we can use unpacks instead of generating prologue
4853          masks and sometimes the prologue mask will fold to a constant,
4854          so the actual prologue cost might be smaller.  However, it's
4855          simpler and safer to use the worst-case cost; if this ends up
4856          being the tie-breaker between vectorizing or not, then it's
4857          probably better not to vectorize.  */
4858       (void) add_stmt_cost (target_cost_data, num_masks,
4859                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4860                             vect_prologue);
4861       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4862                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4863                             vect_body);
4864     }
4865   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4866     {
4867       /* Referring to the functions vect_set_loop_condition_partial_vectors
4868          and vect_set_loop_controls_directly, we need to generate each
4869          length in the prologue and in the loop body if required.  Although
4870          there are some possible optimizations, we consider the worst case
4871          here.  */
4872
4873       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4874       signed char partial_load_store_bias
4875         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4876       bool need_iterate_p
4877         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4878            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4879
4880       /* Calculate how many statements to be added.  */
4881       unsigned int prologue_stmts = 0;
4882       unsigned int body_stmts = 0;
4883
4884       rgroup_controls *rgc;
4885       unsigned int num_vectors_m1;
4886       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4887         if (rgc->type)
4888           {
4889             /* May need one SHIFT for nitems_total computation.  */
4890             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4891             if (nitems != 1 && !niters_known_p)
4892               prologue_stmts += 1;
4893
4894             /* May need one MAX and one MINUS for wrap around.  */
4895             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4896               prologue_stmts += 2;
4897
4898             /* Need one MAX and one MINUS for each batch limit excepting for
4899                the 1st one.  */
4900             prologue_stmts += num_vectors_m1 * 2;
4901
4902             unsigned int num_vectors = num_vectors_m1 + 1;
4903
4904             /* Need to set up lengths in prologue, only one MIN required
4905                for each since start index is zero.  */
4906             prologue_stmts += num_vectors;
4907
4908             /* If we have a non-zero partial load bias, we need one PLUS
4909                to adjust the load length.  */
4910             if (partial_load_store_bias != 0)
4911               body_stmts += 1;
4912
4913             unsigned int length_update_cost = 0;
4914             if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4915               /* For decrement IV style, each only needs a single SELECT_VL
4916                  or MIN from the beginning to calculate the number of
4917                  elements to be processed in the current iteration.  */
4918               length_update_cost = 1;
4919             else
4920               /* For increment IV style, each may need two MINs and one MINUS to
4921                  update lengths in body for next iteration.  */
4922               length_update_cost = 3;
4923
4924             if (need_iterate_p)
4925               body_stmts += length_update_cost * num_vectors;
4926           }
4927
4928       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4929                             scalar_stmt, vect_prologue);
4930       (void) add_stmt_cost (target_cost_data, body_stmts,
4931                             scalar_stmt, vect_body);
4932     }
4933
4934   /* FORNOW: The scalar outside cost is incremented in one of the
4935      following ways:
4936
4937      1. The vectorizer checks for alignment and aliasing and generates
4938      a condition that allows dynamic vectorization.  A cost model
4939      check is ANDED with the versioning condition.  Hence scalar code
4940      path now has the added cost of the versioning check.
4941
4942        if (cost > th & versioning_check)
4943          jmp to vector code
4944
4945      Hence run-time scalar is incremented by not-taken branch cost.
4946
4947      2. The vectorizer then checks if a prologue is required.  If the
4948      cost model check was not done before during versioning, it has to
4949      be done before the prologue check.
4950
4951        if (cost <= th)
4952          prologue = scalar_iters
4953        if (prologue == 0)
4954          jmp to vector code
4955        else
4956          execute prologue
4957        if (prologue == num_iters)
4958          go to exit
4959
4960      Hence the run-time scalar cost is incremented by a taken branch,
4961      plus a not-taken branch, plus a taken branch cost.
4962
4963      3. The vectorizer then checks if an epilogue is required.  If the
4964      cost model check was not done before during prologue check, it
4965      has to be done with the epilogue check.
4966
4967        if (prologue == 0)
4968          jmp to vector code
4969        else
4970          execute prologue
4971        if (prologue == num_iters)
4972          go to exit
4973        vector code:
4974          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4975            jmp to epilogue
4976
4977      Hence the run-time scalar cost should be incremented by 2 taken
4978      branches.
4979
4980      TODO: The back end may reorder the BBS's differently and reverse
4981      conditions/branch directions.  Change the estimates below to
4982      something more reasonable.  */
4983
4984   /* If the number of iterations is known and we do not do versioning, we can
4985      decide whether to vectorize at compile time.  Hence the scalar version
4986      does not carry cost model guard costs.  */
4987   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4988       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4989     {
4990       /* Cost model check occurs at versioning.  */
4991       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4992         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4993       else
4994         {
4995           /* Cost model check occurs at prologue generation.  */
4996           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4997             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4998               + vect_get_stmt_cost (cond_branch_not_taken);
4999           /* Cost model check occurs at epilogue generation.  */
5000           else
5001             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5002         }
5003     }
5004
5005   /* Complete the target-specific cost calculations.  */
5006   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5007                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5008                suggested_unroll_factor);
5009
       /* Give up on the suggested unrolling when the maximum vectorization
          factor has been restricted and VF times the unroll factor is not
          known to stay within it.  */
5010   if (suggested_unroll_factor && *suggested_unroll_factor > 1
5011       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5012       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5013                     *suggested_unroll_factor,
5014                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5015     {
5016       if (dump_enabled_p ())
5017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5018                          "can't unroll as unrolled vectorization factor larger"
5019                          " than maximum vectorization factor: "
5020                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5021                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5022       *suggested_unroll_factor = 1;
5023     }
5024
       /* The vector outside cost (VOC below) is everything charged to the
          prologue and the epilogue.  */
5025   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5026
       /* NOTE(review): vec_inside_cost, vec_prologue_cost and vec_epilogue_cost
          are declared unsigned but are printed with %d below.  */
5027   if (dump_enabled_p ())
5028     {
5029       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5030       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
5031                    vec_inside_cost);
5032       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
5033                    vec_prologue_cost);
5034       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
5035                    vec_epilogue_cost);
5036       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
5037                    scalar_single_iter_cost);
5038       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
5039                    scalar_outside_cost);
5040       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
5041                    vec_outside_cost);
5042       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
5043                    peel_iters_prologue);
5044       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
5045                    peel_iters_epilogue);
5046     }
5047
5048   /* Calculate number of iterations required to make the vector version
5049      profitable, relative to the loop bodies only.  The following condition
5050      must hold true:
5051      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5052      where
5053      SIC = scalar iteration cost, VIC = vector iteration cost,
5054      VOC = vector outside cost, VF = vectorization factor,
5055      NPEEL = prologue iterations + epilogue iterations,
5056      SOC = scalar outside cost for run time cost model check.  */
5057
       /* SIC * VF - VIC: what one vector iteration saves over the VF scalar
          iterations it replaces.  Without a positive saving the loop is
          reported as not profitable.  */
5058   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5059                           - vec_inside_cost);
5060   if (saving_per_viter <= 0)
5061     {
5062       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5063         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5064                     "vectorization did not happen for a simd loop");
5065
5066       if (dump_enabled_p ())
5067         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5068                          "cost model: the vector iteration cost = %d "
5069                          "divided by the scalar iteration cost = %d "
5070                          "is greater or equal to the vectorization factor = %d"
5071                          ".\n",
5072                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5073       *ret_min_profitable_niters = -1;
5074       *ret_min_profitable_estimate = -1;
5075       return;
5076     }
5077
5078   /* ??? The "if" arm is written to handle all cases; see below for what
5079      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5080   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5081     {
5082       /* Rewriting the condition above in terms of the number of
5083          vector iterations (vniters) rather than the number of
5084          scalar iterations (niters) gives:
5085
5086          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5087
5088          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5089
5090          For integer N, X and Y when X > 0:
5091
5092          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
5093       int outside_overhead = (vec_outside_cost
5094                               - scalar_single_iter_cost * peel_iters_prologue
5095                               - scalar_single_iter_cost * peel_iters_epilogue
5096                               - scalar_outside_cost);
5097       /* We're only interested in cases that require at least one
5098          vector iteration.  */
5099       int min_vec_niters = 1;
5100       if (outside_overhead > 0)
5101         min_vec_niters = outside_overhead / saving_per_viter + 1;
5102
5103       if (dump_enabled_p ())
5104         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
5105                      min_vec_niters);
5106
           /* NOTE(review): this repeats the enclosing test, so the else arm
              below looks unreachable as written; see the ??? comment above.  */
5107       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5108         {
5109           /* Now that we know the minimum number of vector iterations,
5110              find the minimum niters for which the scalar cost is larger:
5111
5112              SIC * niters > VIC * vniters + VOC - SOC
5113
5114              We know that the minimum niters is no more than
5115              vniters * VF + NPEEL, but it might be (and often is) less
5116              than that if a partial vector iteration is cheaper than the
5117              equivalent scalar code.  */
5118           int threshold = (vec_inside_cost * min_vec_niters
5119                            + vec_outside_cost
5120                            - scalar_outside_cost);
5121           if (threshold <= 0)
5122             min_profitable_iters = 1;
5123           else
5124             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5125         }
5126       else
5127         /* Convert the number of vector iterations into a number of
5128            scalar iterations.  */
5129         min_profitable_iters = (min_vec_niters * assumed_vf
5130                                 + peel_iters_prologue
5131                                 + peel_iters_epilogue);
5132     }
5133   else
5134     {
           /* Multiplying the condition above by VF and collecting niters gives

                niters * (SIC * VF - VIC) > (VOC - SOC) * VF - VIC * NPEEL

              The right-hand side is computed first and then divided by
              saving_per_viter.  */
5135       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5136                               * assumed_vf
5137                               - vec_inside_cost * peel_iters_prologue
5138                               - vec_inside_cost * peel_iters_epilogue);
5139       if (min_profitable_iters <= 0)
5140         min_profitable_iters = 0;
5141       else
5142         {
5143           min_profitable_iters /= saving_per_viter;
5144
               /* The division truncates.  Take one more iteration if, at the
                  truncated count, SIC * VF * niters still does not exceed
                  VIC * niters + (VOC - SOC) * VF.  */
5145           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5146               <= (((int) vec_inside_cost * min_profitable_iters)
5147                   + (((int) vec_outside_cost - scalar_outside_cost)
5148                      * assumed_vf)))
5149             min_profitable_iters++;
5150         }
5151     }
5152
5153   if (dump_enabled_p ())
5154     dump_printf (MSG_NOTE,
5155                  "  Calculated minimum iters for profitability: %d\n",
5156                  min_profitable_iters);
5157
5158   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5159       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5160     /* We want the vectorized loop to execute at least once.  */
5161     min_profitable_iters = assumed_vf + peel_iters_prologue;
5162   else if (min_profitable_iters < peel_iters_prologue)
5163     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5164        vectorized loop executes at least once.  */
5165     min_profitable_iters = peel_iters_prologue;
5166
5167   if (dump_enabled_p ())
5168     dump_printf_loc (MSG_NOTE, vect_location,
5169                      "  Runtime profitability threshold = %d\n",
5170                      min_profitable_iters);
5171
5172   *ret_min_profitable_niters = min_profitable_iters;
5173
5174   /* Calculate number of iterations required to make the vector version
5175      profitable, relative to the loop bodies only.
5176
5177      Non-vectorized variant is SIC * niters and it must win over vector
5178      variant on the expected loop trip count.  The following condition must hold true:
5179      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
5180
5181   if (vec_outside_cost <= 0)
5182     min_profitable_estimate = 0;
5183   /* ??? This "else if" arm is written to handle all cases; see below for
5184      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5185   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5186     {
5187       /* This is a repeat of the code above, but with + SOC rather
5188          than - SOC.  */
5189       int outside_overhead = (vec_outside_cost
5190                               - scalar_single_iter_cost * peel_iters_prologue
5191                               - scalar_single_iter_cost * peel_iters_epilogue
5192                               + scalar_outside_cost);
5193       int min_vec_niters = 1;
5194       if (outside_overhead > 0)
5195         min_vec_niters = outside_overhead / saving_per_viter + 1;
5196
           /* NOTE(review): as above, this repeats the enclosing test, so the
              else arm below looks unreachable as written.  */
5197       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5198         {
5199           int threshold = (vec_inside_cost * min_vec_niters
5200                            + vec_outside_cost
5201                            + scalar_outside_cost);
5202           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5203         }
5204       else
5205         min_profitable_estimate = (min_vec_niters * assumed_vf
5206                                    + peel_iters_prologue
5207                                    + peel_iters_epilogue);
5208     }
5209   else
5210     {
           /* Same closed form as for min_profitable_iters, with + SOC instead
              of - SOC and without the adjustment for the truncating
              division.  */
5211       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5212                                  * assumed_vf
5213                                  - vec_inside_cost * peel_iters_prologue
5214                                  - vec_inside_cost * peel_iters_epilogue)
5215                                  / ((scalar_single_iter_cost * assumed_vf)
5216                                    - vec_inside_cost);
5217     }
       /* The static estimate is never allowed to fall below the runtime
          threshold computed above.  */
5218   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5219   if (dump_enabled_p ())
5220     dump_printf_loc (MSG_NOTE, vect_location,
5221                      "  Static estimate profitability threshold = %d\n",
5222                      min_profitable_estimate);
5223
5224   *ret_min_profitable_estimate = min_profitable_estimate;
5225 }
5226
5227 /* Store in SEL the selector of a vec_perm that acts like a vec_shr of a
5228    vector with NELT elements by OFFSET whole elements (not bits).  */
5229 static void
5230 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5231                               vec_perm_builder *sel)
5232 {
5233   /* Encode one stepped pattern: OFFSET, OFFSET + 1, OFFSET + 2.  Indices
5234      that wrap around are dealt with by vec_perm_indices.  */
5235   sel->new_vector (nelt, 1, 3);
5236   for (unsigned int idx = offset, end = offset + 3; idx != end; ++idx)
5237     sel->quick_push (idx);
5238 }
5239
5240 /* Return true if vectors of mode MODE can be shifted as a whole: either
5241    the target implements vec_shr_optab for MODE, or it accepts a constant
5242    vec_perm for every shift amount probed below.  */
5243 static bool
5244 have_whole_vector_shift (machine_mode mode)
5245 {
5246   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5247     return true;
5248
5249   /* The permute fallback is only tried for a constant element count.  */
5250   unsigned int nunits;
5251   if (!GET_MODE_NUNITS (mode).is_constant (&nunits))
5252     return false;
5253
5254   vec_perm_builder mask;
5255   vec_perm_indices perm;
5256   for (unsigned int shift = nunits / 2; shift != 0; shift >>= 1)
5257     {
5258       calc_vec_perm_mask_for_shift (shift, nunits, &mask);
5259       perm.new_vector (mask, 2, nunits);
5260       if (!can_vec_perm_const_p (mode, mode, perm, false))
5261         return false;
5262     }
5263   return true;
5264 }
5265
5266 /* Return true if STMT_INFO is a DOT_PROD_EXPR reduction whose multiplication
5267    operands differ in signedness and which the target cannot do directly, so
5268    that it is emulated as described at vect_emulate_mixed_dot_prod.  */
5269
5270 static bool
5271 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5272                                  stmt_vec_info stmt_info)
5273 {
5274   gassign *dot_prod = dyn_cast<gassign *> (stmt_info->stmt);
5275   if (!dot_prod)
5276     return false;
5277   if (gimple_assign_rhs_code (dot_prod) != DOT_PROD_EXPR)
5278     return false;
5279
5280   tree type1 = TREE_TYPE (gimple_assign_rhs1 (dot_prod));
5281   tree type2 = TREE_TYPE (gimple_assign_rhs2 (dot_prod));
5282   if (TYPE_SIGN (type1) == TYPE_SIGN (type2))
5283     return false;
5284   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5285   gcc_assert (reduc_info->is_reduc_info);
5286   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
5287   return !directly_supported_p (DOT_PROD_EXPR, vectype_in,
5288                                 optab_vector_mixed_sign);
5289 }
5290
5291 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5292    functions. Design better to avoid maintenance issues.  */
5293
5294 /* Function vect_model_reduction_cost.
5295
5296    Models cost for a reduction operation, including the vector ops
5297    generated within the strip-mine loop in some cases, the initial
5298    definition before the loop, and the epilogue code that must be generated.

        STMT_INFO is the reduction statement being costed; its vector type
        drives the element counts used below.  REDUC_FN is compared against
        IFN_LAST to choose between a single reduction operation and an
        open-coded sequence.  REDUCTION_TYPE selects the costing scheme.
        NCOPIES scales the in-loop cost of a FOLD_LEFT_REDUCTION.  A null
        LOOP_VINFO is tolerated when looking up the loop, in which case the
        epilogue is always costed.  All costs are recorded in COST_VEC; the
        local totals are only reported in the dump output.  */
5299
5300 static void
5301 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5302                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5303                            vect_reduction_type reduction_type,
5304                            int ncopies, stmt_vector_for_cost *cost_vec)
5305 {
5306   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5307   tree vectype;
5308   machine_mode mode;
5309   class loop *loop = NULL;
5310
5311   if (loop_vinfo)
5312     loop = LOOP_VINFO_LOOP (loop_vinfo);
5313
5314   /* Condition reductions generate two reductions in the loop.
            NOTE(review): after this doubling NCOPIES is only read in the
            FOLD_LEFT_REDUCTION arm below, so the doubling does not influence
            any cost recorded here.  */
5315   if (reduction_type == COND_REDUCTION)
5316     ncopies *= 2;
5317
5318   vectype = STMT_VINFO_VECTYPE (stmt_info);
5319   mode = TYPE_MODE (vectype);
5320   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5321
       /* Decompose the original scalar statement.  OP's code and type are
          used by the open-coded epilogue costing at the end; failing to
          extract them is treated as impossible.  */
5322   gimple_match_op op;
5323   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5324     gcc_unreachable ();
5325
       /* NOTE(review): LOOP_VINFO is passed on here without the null check
          applied above.  */
5326   bool emulated_mixed_dot_prod
5327     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5328   if (reduction_type == EXTRACT_LAST_REDUCTION)
5329     /* No extra instructions are needed in the prologue.  The loop body
5330        operations are costed in vectorizable_condition.  */
5331     inside_cost = 0;
5332   else if (reduction_type == FOLD_LEFT_REDUCTION)
5333     {
5334       /* No extra instructions needed in the prologue.  */
5335       prologue_cost = 0;
5336
5337       if (reduc_fn != IFN_LAST)
5338         /* Count one reduction-like operation per vector.  */
5339         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5340                                         stmt_info, 0, vect_body);
5341       else
5342         {
5343           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5344           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5345           inside_cost = record_stmt_cost (cost_vec, nelements,
5346                                           vec_to_scalar, stmt_info, 0,
5347                                           vect_body);
5348           inside_cost += record_stmt_cost (cost_vec, nelements,
5349                                            scalar_stmt, stmt_info, 0,
5350                                            vect_body);
5351         }
5352     }
5353   else
5354     {
5355       /* Add in the cost of the initial definitions.  */
5356       int prologue_stmts;
5357       if (reduction_type == COND_REDUCTION)
5358         /* For cond reductions we have four vectors: initial index, step,
5359            initial result of the data reduction, initial value of the index
5360            reduction.  */
5361         prologue_stmts = 4;
5362       else if (emulated_mixed_dot_prod)
5363         /* We need the initial reduction value and two invariants:
5364            one that contains the minimum signed value and one that
5365            contains half of its negative.  */
5366         prologue_stmts = 3;
5367       else
5368         prologue_stmts = 1;
5369       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5370                                          scalar_to_vec, stmt_info, 0,
5371                                          vect_prologue);
5372     }
5373
5374   /* Determine cost of epilogue code.
5375
5376      We have a reduction operator that will reduce the vector in one statement.
5377      Also requires scalar extract.
     
          Epilogue costs are only added when there is no loop or when
          nested_in_vect_loop_p does not hold for the original statement.  */
5378
5379   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5380     {
5381       if (reduc_fn != IFN_LAST)
5382         {
5383           if (reduction_type == COND_REDUCTION)
5384             {
5385               /* An EQ stmt and an COND_EXPR stmt.  */
5386               epilogue_cost += record_stmt_cost (cost_vec, 2,
5387                                                  vector_stmt, stmt_info, 0,
5388                                                  vect_epilogue);
5389               /* Reduction of the max index and a reduction of the found
5390                  values.  */
5391               epilogue_cost += record_stmt_cost (cost_vec, 2,
5392                                                  vec_to_scalar, stmt_info, 0,
5393                                                  vect_epilogue);
5394               /* A broadcast of the max value.  */
5395               epilogue_cost += record_stmt_cost (cost_vec, 1,
5396                                                  scalar_to_vec, stmt_info, 0,
5397                                                  vect_epilogue);
5398             }
5399           else
5400             {
                   /* One vector statement plus one vector-to-scalar
                      operation.  */
5401               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5402                                                  stmt_info, 0, vect_epilogue);
5403               epilogue_cost += record_stmt_cost (cost_vec, 1,
5404                                                  vec_to_scalar, stmt_info, 0,
5405                                                  vect_epilogue);
5406             }
5407         }
5408       else if (reduction_type == COND_REDUCTION)
5409         {
5410           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5411           /* Extraction of scalar elements.  */
5412           epilogue_cost += record_stmt_cost (cost_vec,
5413                                              2 * estimated_nunits,
5414                                              vec_to_scalar, stmt_info, 0,
5415                                              vect_epilogue);
5416           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5417           epilogue_cost += record_stmt_cost (cost_vec,
5418                                              2 * estimated_nunits - 3,
5419                                              scalar_stmt, stmt_info, 0,
5420                                              vect_epilogue);
5421         }
5422       else if (reduction_type == EXTRACT_LAST_REDUCTION
5423                || reduction_type == FOLD_LEFT_REDUCTION)
5424         /* No extra instructions need in the epilogue.  */
5425         ;
5426       else
5427         {
               /* No REDUC_FN: the final reduction is open-coded.  NELEMENTS
                  is the vector size divided by the size of OP's type.  */
5428           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5429           tree bitsize = TYPE_SIZE (op.type);
5430           int element_bitsize = tree_to_uhwi (bitsize);
5431           int nelements = vec_size_in_bits / element_bitsize;
5432
               /* Query target support as for MAX_EXPR when OP is a
                  COND_EXPR.  */
5433           if (op.code == COND_EXPR)
5434             op.code = MAX_EXPR;
5435
5436           /* We have a whole vector shift available.  */
5437           if (VECTOR_MODE_P (mode)
5438               && directly_supported_p (op.code, vectype)
5439               && have_whole_vector_shift (mode))
5440             {
5441               /* Final reduction via vector shifts and the reduction operator.
5442                  Also requires scalar extract.  */
5443               epilogue_cost += record_stmt_cost (cost_vec,
5444                                                  exact_log2 (nelements) * 2,
5445                                                  vector_stmt, stmt_info, 0,
5446                                                  vect_epilogue);
5447               epilogue_cost += record_stmt_cost (cost_vec, 1,
5448                                                  vec_to_scalar, stmt_info, 0,
5449                                                  vect_epilogue);
5450             }
5451           else
5452             /* Use extracts and reduction op for final reduction.  For N
5453                elements, we have N extracts and N-1 reduction ops.  */
5454             epilogue_cost += record_stmt_cost (cost_vec,
5455                                                nelements + nelements - 1,
5456                                                vector_stmt, stmt_info, 0,
5457                                                vect_epilogue);
5458         }
5459     }
5460
5461   if (dump_enabled_p ())
5462     dump_printf (MSG_NOTE,
5463                  "vect_model_reduction_cost: inside_cost = %d, "
5464                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5465                  prologue_cost, epilogue_cost);
5466 }
5467
5468 /* Emit SEQ, the instruction sequence that initializes the reduction
5469    described by REDUC_INFO, at the place where that initialization has
5470    to happen for the loop described by LOOP_VINFO.  */
5471
5472 static void
5473 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5474                                 stmt_vec_info reduc_info, gimple *seq)
5475 {
5476   if (!reduc_info->reused_accumulator)
5477     {
5478       /* Ordinary reduction: insert SEQ on the preheader edge of the
5479          loop.  */
5480       edge preheader = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5481       gsi_insert_seq_on_edge_immediate (preheader, seq);
5482       return;
5483     }
5484
5485   /* The accumulator is taken over from the main loop, so initialization
5486      is only needed on the path that skips the main loop.  Put SEQ at the
5487      end of the guard block from which that skip edge leaves, in front of
5488      the block's last statement.  */
5489   edge guard_edge = loop_vinfo->skip_main_loop_edge;
5490   gcc_assert (guard_edge);
5491   gimple_stmt_iterator insert_pt = gsi_last_bb (guard_edge->src);
5492   gsi_insert_seq_before (&insert_pt, seq, GSI_SAME_STMT);
5493 }
5494
5495 /* Function get_initial_def_for_reduction
5496
5497    Input:
        LOOP_VINFO - the loop_vec_info whose loop the reduction belongs to
5498    REDUC_INFO - the info_for_reduction
5499    INIT_VAL - the initial value of the reduction variable
5500    NEUTRAL_OP - a value that has no effect on the reduction, as per
5501                 neutral_op_for_reduction
5502
5503    Output:
5504    Return a vector variable, initialized according to the operation that
5505         REDUC_INFO performs. This vector will be used as the initial value
5506         of the vector of partial results.
5507
5508    The value we need is a vector in which element 0 has value INIT_VAL
5509    and every other element has value NEUTRAL_OP.

        Any statements needed to build the vector are emitted through
        vect_emit_reduction_init_stmts.  */
5510
5511 static tree
5512 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5513                                stmt_vec_info reduc_info,
5514                                tree init_val, tree neutral_op)
5515 {
5516   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5517   tree scalar_type = TREE_TYPE (init_val);
       /* The vector type is derived from the scalar type of INIT_VAL.  */
5518   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5519   tree init_def;
5520   gimple_seq stmts = NULL;
5521
5522   gcc_assert (vectype);
5523
       /* Only pointer, integral and scalar floating-point reductions are
          expected here.  */
5524   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5525               || SCALAR_FLOAT_TYPE_P (scalar_type));
5526
       /* REDUC_INFO must either be nested in LOOP or have LOOP as the loop
          father of its statement's block.  */
5527   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5528               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5529
5530   if (operand_equal_p (init_val, neutral_op))
5531     {
5532       /* If both elements are equal then the vector described above is
5533          just a splat.  */
5534       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5535       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5536     }
5537   else
5538     {
           /* Convert both values to the element type of VECTYPE first.  */
5539       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5540       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5541       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5542         {
5543           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5544              element 0.  */
5545           init_def = gimple_build_vector_from_val (&stmts, vectype,
5546                                                    neutral_op);
5547           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5548                                    vectype, init_def, init_val);
5549         }
5550       else
5551         {
5552           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  Only the
                  first two elements are pushed; presumably the (1, 2)
                  builder encoding extends NEUTRAL_OP to the rest.  */
5553           tree_vector_builder elts (vectype, 1, 2);
5554           elts.quick_push (init_val);
5555           elts.quick_push (neutral_op);
5556           init_def = gimple_build_vector (&stmts, &elts);
5557         }
5558     }
5559
       /* Emit whatever statements the conversions and builds produced.  */
5560   if (stmts)
5561     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5562   return init_def;
5563 }
5564
5565 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5566    which performs a reduction involving GROUP_SIZE scalar statements.
5567    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5568    is nonnull, introducing extra elements of that value will not change the
5569    result.

        The created vector defs are added to VEC_OPRNDS.  The scalar initial
        values are read from REDUC_INFO->reduc_initial_values, and any
        statements needed to build the vectors are emitted through
        vect_emit_reduction_init_stmts.
        NOTE(review): VEC_OPRNDS is filled with quick_push, so presumably
        the caller has reserved room for NUMBER_OF_VECTORS entries.  */
5570
5571 static void
5572 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5573                                 stmt_vec_info reduc_info,
5574                                 vec<tree> *vec_oprnds,
5575                                 unsigned int number_of_vectors,
5576                                 unsigned int group_size, tree neutral_op)
5577 {
5578   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5579   unsigned HOST_WIDE_INT nunits;
5580   unsigned j, number_of_places_left_in_vector;
5581   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5582   unsigned int i;
5583
       /* Without a neutral value every group member needs its own initial
          value.  */
5584   gcc_assert (group_size == initial_values.length () || neutral_op);
5585
5586   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5587      created vectors. It is greater than 1 if unrolling is performed.
5588
5589      For example, we have two scalar operands, s1 and s2 (e.g., group of
5590      strided accesses of size two), while NUNITS is four (i.e., four scalars
5591      of this type can be packed in a vector).  The output vector will contain
5592      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5593      will be 2).
5594
5595      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5596      vectors containing the operands.
5597
5598      For example, NUNITS is four as before, and the group size is 8
5599      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5600      {s5, s6, s7, s8}.  */
5601
       /* For variable-length vectors, build ELTS with GROUP_SIZE elements
          instead of a full vector's worth.  */
5602   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5603     nunits = group_size;
5604
5605   number_of_places_left_in_vector = nunits;
5606   bool constant_p = true;
5607   tree_vector_builder elts (vector_type, nunits, 1);
5608   elts.quick_grow (nunits);
5609   gimple_seq ctor_seq = NULL;
       /* Walk all NUNITS * NUMBER_OF_VECTORS element slots.  I is the group
          member that slot J belongs to.  */
5610   for (j = 0; j < nunits * number_of_vectors; ++j)
5611     {
5612       tree op;
5613       i = j % group_size;
5614
5615       /* Get the def before the loop.  In reduction chain we have only
5616          one initial value.  Else we have as many as PHIs in the group.
              NEUTRAL_OP is used for members without an initial value and,
              when it is nonnull, for every slot after a member's first
              occurrence (J > I).  */
5617       if (i >= initial_values.length () || (j > i && neutral_op))
5618         op = neutral_op;
5619       else
5620         op = initial_values[i];
5621
5622       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5623       number_of_places_left_in_vector--;
5624       elts[nunits - number_of_places_left_in_vector - 1] = op;
5625       if (!CONSTANT_CLASS_P (op))
5626         constant_p = false;
5627
           /* ELTS is full: turn it into a vector def.  */
5628       if (number_of_places_left_in_vector == 0)
5629         {
5630           tree init;
               /* With all-constant elements and no NEUTRAL_OP it is enough
                  for the vector length to be a multiple of NUNITS; in every
                  other case the length must be known to equal NUNITS.  */
5631           if (constant_p && !neutral_op
5632               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5633               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5634             /* Build the vector directly from ELTS.  */
5635             init = gimple_build_vector (&ctor_seq, &elts);
5636           else if (neutral_op)
5637             {
5638               /* Build a vector of the neutral value and shift the
5639                  other elements into place.  K first skips trailing
                      elements that are NEUTRAL_OP itself; the remaining
                      elements are then inserted from last to first.  */
5640               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5641                                                    neutral_op);
5642               int k = nunits;
5643               while (k > 0 && elts[k - 1] == neutral_op)
5644                 k -= 1;
5645               while (k > 0)
5646                 {
5647                   k -= 1;
5648                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5649                                        vector_type, init, elts[k]);
5650                 }
5651             }
5652           else
5653             {
5654               /* First time round, duplicate ELTS to fill the
5655                  required number of vectors.  This fills VEC_OPRNDS in
                      one go, so the walk stops here.  */
5656               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5657                                         elts, number_of_vectors, *vec_oprnds);
5658               break;
5659             }
5660           vec_oprnds->quick_push (init);
5661
               /* Start a fresh ELTS for the next vector.  */
5662           number_of_places_left_in_vector = nunits;
5663           elts.new_vector (vector_type, nunits, 1);
5664           elts.quick_grow (nunits);
5665           constant_p = true;
5666         }
5667     }
       /* Emit the statements that build the vectors, if there are any.  */
5668   if (ctor_seq != NULL)
5669     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5670 }
5671
5672 /* Return the stmt_vec_info on which the reduction meta information for
5673    STMT_INFO, a statement taking part in a reduction, is stored.  */
5674
5675 stmt_vec_info
5676 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5677 {
5678   stmt_vec_info res = vect_orig_stmt (stmt_info);
5679   gcc_assert (STMT_VINFO_REDUC_DEF (res));
5680   bool cycle_phi_p = (is_a <gphi *> (res->stmt)
5681                       && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (res)));
5682   if (!cycle_phi_p)
5683     res = STMT_VINFO_REDUC_DEF (res);
5684
5685   gphi *phi = as_a <gphi *> (res->stmt);
5686   if (STMT_VINFO_DEF_TYPE (res) == vect_nested_cycle)
5687     {
5688       stmt_vec_info outer = vinfo->lookup_def (vect_phi_initial_value (phi));
5689       if (outer && STMT_VINFO_DEF_TYPE (outer) == vect_double_reduction_def)
5690         res = outer;
5691     }
5692   else if (STMT_VINFO_DEF_TYPE (res) == vect_double_reduction_def
5693            && gimple_phi_num_args (phi) == 1)
5694     res = STMT_VINFO_REDUC_DEF (res);
5695   return res;
5696 }
5697
5698 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5699    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5700    return false.

        On success REDUC_INFO's epilogue adjustment, initial values and
        reused_accumulator field are updated; on failure REDUC_INFO is left
        unchanged.  */
5701
5702 static bool
5703 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5704                                 stmt_vec_info reduc_info)
5705 {
       /* Without an original loop info there is no main loop to build on.  */
5706   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5707   if (!main_loop_vinfo)
5708     return false;
5709
       /* Only TREE_CODE_REDUCTION reductions are considered.  */
5710   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5711     return false;
5712
       /* Collect, for each reduction PHI, the value coming from the main
          loop and (if the main loop can be skipped) the value coming from
          the guard block.  */
5713   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5714   auto_vec<tree, 16> main_loop_results (num_phis);
5715   auto_vec<tree, 16> initial_values (num_phis);
5716   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5717     {
5718       /* The epilogue loop can be entered either from the main loop or
5719          from an earlier guard block.  */
5720       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5721       for (tree incoming_value : reduc_info->reduc_initial_values)
5722         {
5723           /* Look for:
5724
5725                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5726                                     INITIAL_VALUE(guard block)>.  */
5727           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5728
5729           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5730           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5731
5732           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5733           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5734
5735           main_loop_results.quick_push (from_main_loop);
5736           initial_values.quick_push (from_skip);
5737         }
5738     }
5739   else
5740     /* The main loop dominates the epilogue loop.  INITIAL_VALUES stays
            empty in this case.  */
5741     main_loop_results.splice (reduc_info->reduc_initial_values);
5742
5743   /* See if the main loop has the kind of accumulator we need.  The
          accumulator is looked up by the first main-loop result and must
          cover exactly the same scalar results, in the same order.  */
5744   vect_reusable_accumulator *accumulator
5745     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5746   if (!accumulator
5747       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5748       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5749                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5750     return false;
5751
5752   /* Handle the case where we can reduce wider vectors to narrower ones.
          The multiple M itself is not used afterwards; only the outcome of
          the test matters.  */
5753   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5754   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5755   unsigned HOST_WIDE_INT m;
5756   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5757                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5758     return false;
5759   /* Check the intermediate vector types and operations are available.
          Each step halves the element count and needs a vector type, direct
          support for the reduction code on it, and a way to extract it from
          the previous, wider type.  */
5760   tree prev_vectype = old_vectype;
5761   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5762   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5763     {
5764       intermediate_nunits = exact_div (intermediate_nunits, 2);
5765       tree intermediate_vectype = get_related_vectype_for_scalar_type
5766         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5767       if (!intermediate_vectype
5768           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5769                                     intermediate_vectype)
5770           || !can_vec_extract (TYPE_MODE (prev_vectype),
5771                                TYPE_MODE (intermediate_vectype)))
5772         return false;
5773       prev_vectype = intermediate_vectype;
5774     }
5775
5776   /* Non-SLP reductions might apply an adjustment after the reduction
5777      operation, in order to simplify the initialization of the accumulator.
5778      If the epilogue loop carries on from where the main loop left off,
5779      it should apply the same adjustment to the final reduction result.
5780
5781      If the epilogue loop can also be entered directly (rather than via
5782      the main loop), we need to be able to handle that case in the same way,
5783      with the same adjustment.  (In principle we could add a PHI node
5784      to select the correct adjustment, but in practice that shouldn't be
5785      necessary.)  */
5786   tree main_adjustment
5787     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5788   if (loop_vinfo->main_loop_edge && main_adjustment)
5789     {
5790       gcc_assert (num_phis == 1);
5791       tree initial_value = initial_values[0];
5792       /* Check that we can use INITIAL_VALUE as the adjustment and
5793          initialize the accumulator with a neutral value instead.  */
5794       if (!operand_equal_p (initial_value, main_adjustment))
5795         return false;
5796       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5797       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5798                                                     code, initial_value);
5799     }
       /* All checks passed: record the adjustment, replace the initial
          values with the ones collected above and remember the accumulator
          being reused.  */
5800   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5801   reduc_info->reduc_initial_values.truncate (0);
5802   reduc_info->reduc_initial_values.splice (initial_values);
5803   reduc_info->reused_accumulator = accumulator;
5804   return true;
5805 }
5806
5807 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5808    CODE, appending the generated stmts to SEQ.  Returns a vector def of
        VECTYPE, or VEC_DEF itself if it has no more elements than VECTYPE.
        Both vector types must have a constant number of elements.  */
5809
5810 static tree
5811 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5812                             gimple_seq *seq)
5813 {
5814   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5815   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5816   tree stype = TREE_TYPE (vectype);
5817   tree new_temp = vec_def;
       /* Each iteration halves the vector: NEW_TEMP is split into two
          halves DST1 and DST2 of type VECTYPE1, which are then combined
          with CODE.  */
5818   while (nunits > nunits1)
5819     {
5820       nunits /= 2;
           /* NOTE(review): VECTYPE1 is used without a null check; presumably
              the caller has verified that the type is available.  */
5821       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5822                                                            stype, nunits);
           /* Size in bits of one half; also the bit offset of the second
              half within NEW_TEMP.  */
5823       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5824
5825       /* The target has to make sure we support lowpart/highpart
5826          extraction, either via direct vector extract or through
5827          an integer mode punning.  */
5828       tree dst1, dst2;
5829       gimple *epilog_stmt;
5830       if (convert_optab_handler (vec_extract_optab,
5831                                  TYPE_MODE (TREE_TYPE (new_temp)),
5832                                  TYPE_MODE (vectype1))
5833           != CODE_FOR_nothing)
5834         {
5835           /* Extract sub-vectors directly once vec_extract becomes
5836              a conversion optab.  DST1 is the half at bit offset 0 and
                  DST2 the half at bit offset BITSIZE.  */
5837           dst1 = make_ssa_name (vectype1);
5838           epilog_stmt
5839               = gimple_build_assign (dst1, BIT_FIELD_REF,
5840                                      build3 (BIT_FIELD_REF, vectype1,
5841                                              new_temp, TYPE_SIZE (vectype1),
5842                                              bitsize_int (0)));
5843           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5844           dst2 =  make_ssa_name (vectype1);
5845           epilog_stmt
5846               = gimple_build_assign (dst2, BIT_FIELD_REF,
5847                                      build3 (BIT_FIELD_REF, vectype1,
5848                                              new_temp, TYPE_SIZE (vectype1),
5849                                              bitsize_int (bitsize)));
5850           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5851         }
5852       else
5853         {
5854           /* Extract via punning to appropriately sized integer mode
5855              vector.  NEW_TEMP is viewed as a two-element vector of
                  BITSIZE-bit integers; each element is extracted and then
                  viewed as VECTYPE1.  */
5856           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5857           tree etype = build_vector_type (eltype, 2);
5858           gcc_assert (convert_optab_handler (vec_extract_optab,
5859                                              TYPE_MODE (etype),
5860                                              TYPE_MODE (eltype))
5861                       != CODE_FOR_nothing);
5862           tree tem = make_ssa_name (etype);
5863           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5864                                              build1 (VIEW_CONVERT_EXPR,
5865                                                      etype, new_temp));
5866           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
               /* From here on NEW_TEMP is the integer-vector view.  */
5867           new_temp = tem;
               /* Element at bit offset 0 becomes DST1.  */
5868           tem = make_ssa_name (eltype);
5869           epilog_stmt
5870               = gimple_build_assign (tem, BIT_FIELD_REF,
5871                                      build3 (BIT_FIELD_REF, eltype,
5872                                              new_temp, TYPE_SIZE (eltype),
5873                                              bitsize_int (0)));
5874           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5875           dst1 = make_ssa_name (vectype1);
5876           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5877                                              build1 (VIEW_CONVERT_EXPR,
5878                                                      vectype1, tem));
5879           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
               /* Element at bit offset BITSIZE becomes DST2.  */
5880           tem = make_ssa_name (eltype);
5881           epilog_stmt
5882               = gimple_build_assign (tem, BIT_FIELD_REF,
5883                                      build3 (BIT_FIELD_REF, eltype,
5884                                              new_temp, TYPE_SIZE (eltype),
5885                                              bitsize_int (bitsize)));
5886           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5887           dst2 =  make_ssa_name (vectype1);
5888           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5889                                              build1 (VIEW_CONVERT_EXPR,
5890                                                      vectype1, tem));
5891           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5892         }
5893
           /* Combine the two halves; the result is the input of the next
              iteration.  */
5894       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5895     }
5896
5897   return new_temp;
5898 }
5899
5900 /* Return the vector definition to be used for a reduction.  If
5901    LAST_VAL_REDUC_P, use the current vector statements, which correspond
5902    to the final value after vectorization; otherwise use the reduction
5903    PHI definitions.  For SLP (SLP_NODE nonnull) vector def number I of the
5904    chosen node is returned; otherwise VEC_STMTS is overwritten with the
5905    chosen statement's vector statements and the lhs of the first one is
5906    returned.  */
5907
5908 tree
5909 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5910                    slp_instance slp_node_instance, bool last_val_reduc_p,
5911                    unsigned i, vec <gimple *> &vec_stmts)
5912 {
5913   if (slp_node)
5914     {
5915       /* Use the instance's reduction PHIs unless the last value is wanted.  */
5916       slp_tree src_node
5917         = last_val_reduc_p ? slp_node : slp_node_instance->reduc_phis;
5918       return vect_get_slp_vect_def (src_node, i);
5919     }
5920
5921   /* Non-SLP: optionally step to the REDUC_DEF of the original statement.  */
5922   stmt_vec_info src_info = reduc_info;
5923   if (!last_val_reduc_p)
5924     src_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5925   vec_stmts = STMT_VINFO_VEC_STMTS (src_info);
5926   return gimple_get_lhs (vec_stmts[0]);
5927 }
5928
5929 /* Function vect_create_epilog_for_reduction
5930
5931    Create code at the loop-epilog to finalize the result of a reduction
5932    computation.
5933
5934    STMT_INFO is the scalar reduction stmt that is being vectorized.
5935    SLP_NODE is an SLP node containing a group of reduction statements. The
5936      first one in this group is STMT_INFO.
5937    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5938    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5939      (counting from 0); note that the function takes no such parameter.
5940    LOOP_EXIT is the edge to update in the merge block.  In the case of a single
5941      exit this edge is always the main loop exit.
5942
5943    This function:
5944    1. Completes the reduction def-use cycles.
5945    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5946       by calling the function specified by REDUC_FN if available, or by
5947       other means (whole-vector shifts or a scalar loop).
5948       The function also creates a new phi node at the loop exit to preserve
5949       loop-closed form, as illustrated below.
5950
5951      The flow at the entry to this function:
5952
5953         loop:
5954           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5955           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5956           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5957         loop_exit:
5958           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5959           use <s_out0>
5960           use <s_out0>
5961
5962      The above is transformed by this function into:
5963
5964         loop:
5965           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5966           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5967           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5968         loop_exit:
5969           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5970           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5971           v_out2 = reduce <v_out1>
5972           s_out3 = extract_field <v_out2, 0>
5973           s_out4 = adjust_result <s_out3>
5974           use <s_out4>
5975           use <s_out4>
5976 */
5977
5978 static void
5979 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5980                                   stmt_vec_info stmt_info,
5981                                   slp_tree slp_node,
5982                                   slp_instance slp_node_instance,
5983                                   edge loop_exit)
5984 {
5985   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5986   gcc_assert (reduc_info->is_reduc_info);
5987   /* For double reductions we need to get at the inner loop reduction
5988      stmt which has the meta info attached.  Our stmt_info is that of the
5989      loop-closed PHI of the inner loop which we remember as
5990      def for the reduction PHI generation.  */
5991   bool double_reduc = false;
5992   bool last_val_reduc_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit
5993                           && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
5994   stmt_vec_info rdef_info = stmt_info;
5995   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5996     {
5997       gcc_assert (!slp_node);
5998       double_reduc = true;
5999       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
6000                                             (stmt_info->stmt, 0));
6001       stmt_info = vect_stmt_to_vectorize (stmt_info);
6002     }
6003   gphi *reduc_def_stmt
6004     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
6005   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
6006   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6007   tree vectype;
6008   machine_mode mode;
6009   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6010   basic_block exit_bb;
6011   tree scalar_dest;
6012   tree scalar_type;
6013   gimple *new_phi = NULL, *phi = NULL;
6014   gimple_stmt_iterator exit_gsi;
6015   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6016   gimple *epilog_stmt = NULL;
6017   gimple *exit_phi;
6018   tree bitsize;
6019   tree def;
6020   tree orig_name, scalar_result;
6021   imm_use_iterator imm_iter, phi_imm_iter;
6022   use_operand_p use_p, phi_use_p;
6023   gimple *use_stmt;
6024   auto_vec<tree> reduc_inputs;
6025   int j, i;
6026   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6027   unsigned int group_size = 1, k;
6028   /* SLP reduction without reduction chain, e.g.,
6029      # a1 = phi <a2, a0>
6030      # b1 = phi <b2, b0>
6031      a2 = operation (a1)
6032      b2 = operation (b1)  */
6033   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6034   bool direct_slp_reduc;
6035   tree induction_index = NULL_TREE;
6036
6037   if (slp_node)
6038     group_size = SLP_TREE_LANES (slp_node);
6039
6040   if (nested_in_vect_loop_p (loop, stmt_info))
6041     {
6042       outer_loop = loop;
6043       loop = loop->inner;
6044       gcc_assert (!slp_node && double_reduc);
6045     }
6046
6047   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6048   gcc_assert (vectype);
6049   mode = TYPE_MODE (vectype);
6050
6051   tree induc_val = NULL_TREE;
6052   tree adjustment_def = NULL;
6053   if (slp_node)
6054     ;
6055   else
6056     {
6057       /* Optimize: for induction condition reduction, if we can't use zero
6058          for induc_val, use initial_def.  */
6059       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6060         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6061       else if (double_reduc)
6062         ;
6063       else
6064         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6065     }
6066
6067   stmt_vec_info single_live_out_stmt[] = { stmt_info };
6068   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6069   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
6070       && loop_exit != LOOP_VINFO_IV_EXIT (loop_vinfo)
6071       /* ???  We should fend this off earlier.  For conversions we create
6072          multiple epilogues, one dead.  */
6073       && stmt_info == reduc_info->reduc_def)
6074     {
6075       gcc_assert (!slp_node);
6076       single_live_out_stmt[0] = reduc_info;
6077     }
6078   else
6079     {
6080       if (slp_reduc)
6081         /* All statements produce live-out values.  */
6082         live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6083       else if (slp_node)
6084         {
6085           /* The last statement in the reduction chain produces the live-out
6086              value.  Note SLP optimization can shuffle scalar stmts to
6087              optimize permutations so we have to search for the last stmt.  */
6088           for (k = 0; k < group_size; ++k)
6089             if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6090               {
6091                 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6092                 break;
6093               }
6094         }
6095     }
6096
6097   unsigned vec_num;
6098   int ncopies;
6099   if (slp_node)
6100     {
6101       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6102       ncopies = 1;
6103     }
6104   else
6105     {
6106       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6107       vec_num = 1;
6108       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6109     }
6110
6111   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6112      which is updated with the current index of the loop for every match of
6113      the original loop's cond_expr (VEC_STMT).  This results in a vector
6114      containing the last time the condition passed for that vector lane.
6115      The first match will be a 1 to allow 0 to be used for non-matching
6116      indexes.  If there are no matches at all then the vector will be all
6117      zeroes.
6118
6119      PR92772: This algorithm is broken for architectures that support
6120      masked vectors, but do not provide fold_extract_last.  */
6121   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6122     {
6123       auto_vec<std::pair<tree, bool>, 2> ccompares;
6124       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6125       cond_info = vect_stmt_to_vectorize (cond_info);
6126       while (cond_info != reduc_info)
6127         {
6128           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6129             {
6130               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6131               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6132               ccompares.safe_push
6133                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6134                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
6135             }
6136           cond_info
6137             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6138                                                  1 + STMT_VINFO_REDUC_IDX
6139                                                         (cond_info)));
6140           cond_info = vect_stmt_to_vectorize (cond_info);
6141         }
6142       gcc_assert (ccompares.length () != 0);
6143
6144       tree indx_before_incr, indx_after_incr;
6145       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6146       int scalar_precision
6147         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6148       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6149       tree cr_index_vector_type = get_related_vectype_for_scalar_type
6150         (TYPE_MODE (vectype), cr_index_scalar_type,
6151          TYPE_VECTOR_SUBPARTS (vectype));
6152
6153       /* First we create a simple vector induction variable which starts
6154          with the values {1,2,3,...} (SERIES_VECT) and increments by the
6155          vector size (STEP).  */
6156
6157       /* Create a {1,2,3,...} vector.  */
6158       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6159
6160       /* Create a vector of the step value.  */
6161       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6162       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6163
6164       /* Create an induction variable.  */
6165       gimple_stmt_iterator incr_gsi;
6166       bool insert_after;
6167       vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6168       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6169                  insert_after, &indx_before_incr, &indx_after_incr);
6170
6171       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6172          filled with zeros (VEC_ZERO).  */
6173
6174       /* Create a vector of 0s.  */
6175       tree zero = build_zero_cst (cr_index_scalar_type);
6176       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6177
6178       /* Create a vector phi node.  */
6179       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6180       new_phi = create_phi_node (new_phi_tree, loop->header);
6181       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6182                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
6183
6184       /* Now take the condition from the loop's original cond_exprs
6185          and produce a new cond_expr (INDEX_COND_EXPR) which for
6186          every match uses values from the induction variable
6187          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6188          (NEW_PHI_TREE).
6189          Finally, we update the phi (NEW_PHI_TREE) to take the value of
6190          the new cond_expr (INDEX_COND_EXPR).  */
6191       gimple_seq stmts = NULL;
6192       for (int i = ccompares.length () - 1; i != -1; --i)
6193         {
6194           tree ccompare = ccompares[i].first;
6195           if (ccompares[i].second)
6196             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6197                                          cr_index_vector_type,
6198                                          ccompare,
6199                                          indx_before_incr, new_phi_tree);
6200           else
6201             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6202                                          cr_index_vector_type,
6203                                          ccompare,
6204                                          new_phi_tree, indx_before_incr);
6205         }
6206       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6207
6208       /* Update the phi with the vec cond.  */
6209       induction_index = new_phi_tree;
6210       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6211                    loop_latch_edge (loop), UNKNOWN_LOCATION);
6212     }
6213
6214   /* 2. Create epilog code.
6215         The reduction epilog code operates across the elements of the vector
6216         of partial results computed by the vectorized loop.
6217         The reduction epilog code consists of:
6218
6219         step 1: compute the scalar result in a vector (v_out2)
6220         step 2: extract the scalar result (s_out3) from the vector (v_out2)
6221         step 3: adjust the scalar result (s_out3) if needed.
6222
6223         Step 1 can be accomplished using one of the following three schemes:
6224           (scheme 1) using reduc_fn, if available.
6225           (scheme 2) using whole-vector shifts, if available.
6226           (scheme 3) using a scalar loop. In this case steps 1+2 above are
6227                      combined.
6228
6229           The overall epilog code looks like this:
6230
6231           s_out0 = phi <s_loop>         # original EXIT_PHI
6232           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
6233           v_out2 = reduce <v_out1>              # step 1
6234           s_out3 = extract_field <v_out2, 0>    # step 2
6235           s_out4 = adjust_result <s_out3>       # step 3
6236
6237           (step 3 is optional, and steps 1 and 2 may be combined).
6238           Lastly, the uses of s_out0 are replaced by s_out4.  */
6239
6240
6241   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6242          v_out1 = phi <VECT_DEF>
6243          Store them in NEW_PHIS.  */
6244   if (double_reduc)
6245     loop = outer_loop;
6246   /* We need to reduce values in all exits.  */
6247   exit_bb = loop_exit->dest;
6248   exit_gsi = gsi_after_labels (exit_bb);
6249   reduc_inputs.create (slp_node ? vec_num : ncopies);
6250   vec <gimple *> vec_stmts = vNULL;
6251   for (unsigned i = 0; i < vec_num; i++)
6252     {
6253       gimple_seq stmts = NULL;
6254       def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6255                                last_val_reduc_p, i, vec_stmts);
6256       for (j = 0; j < ncopies; j++)
6257         {
6258           tree new_def = copy_ssa_name (def);
6259           phi = create_phi_node (new_def, exit_bb);
6260           if (j)
6261             def = gimple_get_lhs (vec_stmts[j]);
6262           if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6263             SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6264           else
6265             {
6266               for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6267                 SET_PHI_ARG_DEF (phi, k, def);
6268             }
6269           new_def = gimple_convert (&stmts, vectype, new_def);
6270           reduc_inputs.quick_push (new_def);
6271         }
6272       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6273     }
6274
6275   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6276          (i.e. when reduc_fn is not available) and in the final adjustment
6277          code (if needed).  Also get the original scalar reduction variable as
6278          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6279          represents a reduction pattern), the tree-code and scalar-def are
6280          taken from the original stmt that the pattern-stmt (STMT) replaces.
6281          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6282          are taken from STMT.  */
6283
6284   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6285   if (orig_stmt_info != stmt_info)
6286     {
6287       /* Reduction pattern  */
6288       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6289       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6290     }
6291
6292   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6293   scalar_type = TREE_TYPE (scalar_dest);
6294   scalar_results.truncate (0);
6295   scalar_results.reserve_exact (group_size);
6296   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6297   bitsize = TYPE_SIZE (scalar_type);
6298
6299   /* True if we should implement SLP_REDUC using native reduction operations
6300      instead of scalar operations.  */
6301   direct_slp_reduc = (reduc_fn != IFN_LAST
6302                       && slp_reduc
6303                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6304
6305   /* In case of reduction chain, e.g.,
6306      # a1 = phi <a3, a0>
6307      a2 = operation (a1)
6308      a3 = operation (a2),
6309
6310      we may end up with more than one vector result.  Here we reduce them
6311      to one vector.
6312
6313      The same is true for a SLP reduction, e.g.,
6314      # a1 = phi <a2, a0>
6315      # b1 = phi <b2, b0>
6316      a2 = operation (a1)
6317      b2 = operation (b1),
6318
6319      where we can end up with more than one vector as well.  We can
6320      easily accumulate vectors when the number of vector elements is
6321      a multiple of the SLP group size.
6322
6323      The same is true if we couldn't use a single defuse cycle.  */
6324   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6325       || direct_slp_reduc
6326       || (slp_reduc
6327           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6328       || ncopies > 1)
6329     {
6330       gimple_seq stmts = NULL;
6331       tree single_input = reduc_inputs[0];
6332       for (k = 1; k < reduc_inputs.length (); k++)
6333         single_input = gimple_build (&stmts, code, vectype,
6334                                      single_input, reduc_inputs[k]);
6335       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6336
6337       reduc_inputs.truncate (0);
6338       reduc_inputs.safe_push (single_input);
6339     }
6340
6341   tree orig_reduc_input = reduc_inputs[0];
6342
6343   /* If this loop is an epilogue loop that can be skipped after the
6344      main loop, we can only share a reduction operation between the
6345      main loop and the epilogue if we put it at the target of the
6346      skip edge.
6347
6348      We can still reuse accumulators if this check fails.  Doing so has
6349      the minor(?) benefit of making the epilogue loop's scalar result
6350      independent of the main loop's scalar result.  */
6351   bool unify_with_main_loop_p = false;
6352   if (reduc_info->reused_accumulator
6353       && loop_vinfo->skip_this_loop_edge
6354       && single_succ_p (exit_bb)
6355       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6356     {
6357       unify_with_main_loop_p = true;
6358
6359       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6360       reduc_inputs[0] = make_ssa_name (vectype);
6361       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6362       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6363                    UNKNOWN_LOCATION);
6364       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6365                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6366       exit_gsi = gsi_after_labels (reduc_block);
6367     }
6368
6369   /* Shouldn't be used beyond this point.  */
6370   exit_bb = nullptr;
6371
6372   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6373       && reduc_fn != IFN_LAST)
6374     {
6375       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6376          various data values where the condition matched and another vector
6377          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6378          need to extract the last matching index (which will be the index with
6379          highest value) and use this to index into the data vector.
6380          For the case where there were no matches, the data vector will contain
6381          all default values and the index vector will be all zeros.  */
6382
6383       /* Get various versions of the type of the vector of indexes.  */
6384       tree index_vec_type = TREE_TYPE (induction_index);
6385       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6386       tree index_scalar_type = TREE_TYPE (index_vec_type);
6387       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6388
6389       /* Get an unsigned integer version of the type of the data vector.  */
6390       int scalar_precision
6391         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6392       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6393       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6394                                                 vectype);
6395
6396       /* First we need to create a vector (ZERO_VEC) of zeros and another
6397          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6398          can create using a MAX reduction and then expanding.
6399          In the case where the loop never made any matches, the max index will
6400          be zero.  */
6401
6402       /* Vector of {0, 0, 0,...}.  */
6403       tree zero_vec = build_zero_cst (vectype);
6404
6405       /* Find maximum value from the vector of found indexes.  */
6406       tree max_index = make_ssa_name (index_scalar_type);
6407       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6408                                                           1, induction_index);
6409       gimple_call_set_lhs (max_index_stmt, max_index);
6410       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6411
6412       /* Vector of {max_index, max_index, max_index,...}.  */
6413       tree max_index_vec = make_ssa_name (index_vec_type);
6414       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6415                                                       max_index);
6416       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6417                                                         max_index_vec_rhs);
6418       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6419
6420       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6421          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6422          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6423          otherwise.  Only one value should match, resulting in a vector
6424          (VEC_COND) with one data value and the rest zeros.
6425          In the case where the loop never made any matches, every index will
6426          match, resulting in a vector with all data values (which will all be
6427          the default value).  */
6428
6429       /* Compare the max index vector to the vector of found indexes to find
6430          the position of the max value.  */
6431       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6432       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6433                                                       induction_index,
6434                                                       max_index_vec);
6435       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6436
6437       /* Use the compare to choose either values from the data vector or
6438          zero.  */
6439       tree vec_cond = make_ssa_name (vectype);
6440       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6441                                                    vec_compare,
6442                                                    reduc_inputs[0],
6443                                                    zero_vec);
6444       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6445
6446       /* Finally we need to extract the data value from the vector (VEC_COND)
6447          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
6448          reduction, but because this doesn't exist, we can use a MAX reduction
6449          instead.  The data value might be signed or a float so we need to cast
6450          it first.
6451          In the case where the loop never made any matches, the data values are
6452          all identical, and so will reduce down correctly.  */
6453
6454       /* Make the matched data values unsigned.  */
6455       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6456       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6457                                        vec_cond);
6458       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6459                                                         VIEW_CONVERT_EXPR,
6460                                                         vec_cond_cast_rhs);
6461       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6462
6463       /* Reduce down to a scalar value.  */
6464       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6465       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6466                                                            1, vec_cond_cast);
6467       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6468       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6469
6470       /* Convert the reduced value back to the result type and set as the
6471          result.  */
6472       gimple_seq stmts = NULL;
6473       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6474                                data_reduc);
6475       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6476       scalar_results.safe_push (new_temp);
6477     }
6478   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6479            && reduc_fn == IFN_LAST)
6480     {
6481       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6482          idx = 0;
6483          idx_val = induction_index[0];
6484          val = data_reduc[0];
6485          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6486            if (induction_index[i] > idx_val)
6487              val = data_reduc[i], idx_val = induction_index[i];
6488          return val;  */
6489
6490       tree data_eltype = TREE_TYPE (vectype);
6491       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6492       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6493       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6494       /* Enforced by vectorizable_reduction, which ensures we have target
6495          support before allowing a conditional reduction on variable-length
6496          vectors.  */
6497       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6498       tree idx_val = NULL_TREE, val = NULL_TREE;
6499       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6500         {
6501           tree old_idx_val = idx_val;
6502           tree old_val = val;
6503           idx_val = make_ssa_name (idx_eltype);
6504           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6505                                              build3 (BIT_FIELD_REF, idx_eltype,
6506                                                      induction_index,
6507                                                      bitsize_int (el_size),
6508                                                      bitsize_int (off)));
6509           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6510           val = make_ssa_name (data_eltype);
6511           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6512                                              build3 (BIT_FIELD_REF,
6513                                                      data_eltype,
6514                                                      reduc_inputs[0],
6515                                                      bitsize_int (el_size),
6516                                                      bitsize_int (off)));
6517           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6518           if (off != 0)
6519             {
6520               tree new_idx_val = idx_val;
6521               if (off != v_size - el_size)
6522                 {
6523                   new_idx_val = make_ssa_name (idx_eltype);
6524                   epilog_stmt = gimple_build_assign (new_idx_val,
6525                                                      MAX_EXPR, idx_val,
6526                                                      old_idx_val);
6527                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6528                 }
6529               tree cond = make_ssa_name (boolean_type_node);
6530               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6531                                                  idx_val, old_idx_val);
6532               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6533               tree new_val = make_ssa_name (data_eltype);
6534               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6535                                                  cond, val, old_val);
6536               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6537               idx_val = new_idx_val;
6538               val = new_val;
6539             }
6540         }
6541       /* Convert the reduced value back to the result type and set as the
6542          result.  */
6543       gimple_seq stmts = NULL;
6544       val = gimple_convert (&stmts, scalar_type, val);
6545       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6546       scalar_results.safe_push (val);
6547     }
6548
6549   /* 2.3 Create the reduction code, using one of the three schemes described
6550          above. In SLP we simply need to extract all the elements from the
6551          vector (without reducing them), so we use scalar shifts.  */
6552   else if (reduc_fn != IFN_LAST && !slp_reduc)
6553     {
6554       tree tmp;
6555       tree vec_elem_type;
6556
6557       /* Case 1:  Create:
6558          v_out2 = reduc_expr <v_out1>  */
6559
6560       if (dump_enabled_p ())
6561         dump_printf_loc (MSG_NOTE, vect_location,
6562                          "Reduce using direct vector reduction.\n");
6563
6564       gimple_seq stmts = NULL;
6565       vec_elem_type = TREE_TYPE (vectype);
6566       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6567                                vec_elem_type, reduc_inputs[0]);
6568       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6569       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6570
6571       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6572           && induc_val)
6573         {
6574           /* Earlier we set the initial value to be a vector of induc_val
6575              values.  Check the result and if it is induc_val then replace
6576              with the original initial value, unless induc_val is
6577              the same as initial_def already.  */
6578           tree zcompare = make_ssa_name (boolean_type_node);
6579           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6580                                              new_temp, induc_val);
6581           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6582           tree initial_def = reduc_info->reduc_initial_values[0];
6583           tmp = make_ssa_name (new_scalar_dest);
6584           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6585                                              initial_def, new_temp);
6586           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6587           new_temp = tmp;
6588         }
6589
6590       scalar_results.safe_push (new_temp);
6591     }
6592   else if (direct_slp_reduc)
6593     {
6594       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6595          with the elements for other SLP statements replaced with the
6596          neutral value.  We can then do a normal reduction on each vector.  */
6597
6598       /* Enforced by vectorizable_reduction.  */
6599       gcc_assert (reduc_inputs.length () == 1);
6600       gcc_assert (pow2p_hwi (group_size));
6601
6602       gimple_seq seq = NULL;
6603
6604       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6605          and the same element size as VECTYPE.  */
6606       tree index = build_index_vector (vectype, 0, 1);
6607       tree index_type = TREE_TYPE (index);
6608       tree index_elt_type = TREE_TYPE (index_type);
6609       tree mask_type = truth_type_for (index_type);
6610
6611       /* Create a vector that, for each element, identifies which of
6612          the REDUC_GROUP_SIZE results should use it.  */
6613       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6614       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6615                             build_vector_from_val (index_type, index_mask));
6616
6617       /* Get a neutral vector value.  This is simply a splat of the neutral
6618          scalar value if we have one, otherwise the initial scalar value
6619          is itself a neutral value.  */
6620       tree vector_identity = NULL_TREE;
6621       tree neutral_op = NULL_TREE;
6622       if (slp_node)
6623         {
6624           tree initial_value = NULL_TREE;
6625           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6626             initial_value = reduc_info->reduc_initial_values[0];
6627           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6628                                                  initial_value, false);
6629         }
6630       if (neutral_op)
6631         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6632                                                         neutral_op);
6633       for (unsigned int i = 0; i < group_size; ++i)
6634         {
6635           /* If there's no universal neutral value, we can use the
6636              initial scalar value from the original PHI.  This is used
6637              for MIN and MAX reduction, for example.  */
6638           if (!neutral_op)
6639             {
6640               tree scalar_value = reduc_info->reduc_initial_values[i];
6641               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6642                                              scalar_value);
6643               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6644                                                               scalar_value);
6645             }
6646
6647           /* Calculate the equivalent of:
6648
6649              sel[j] = (index[j] == i);
6650
6651              which selects the elements of REDUC_INPUTS[0] that should
6652              be included in the result.  */
6653           tree compare_val = build_int_cst (index_elt_type, i);
6654           compare_val = build_vector_from_val (index_type, compare_val);
6655           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6656                                    index, compare_val);
6657
6658           /* Calculate the equivalent of:
6659
6660              vec = sel ? reduc_inputs[0] : vector_identity;
6661
6662              VEC is now suitable for a full vector reduction.  */
6663           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6664                                    sel, reduc_inputs[0], vector_identity);
6665
6666           /* Do the reduction and convert it to the appropriate type.  */
6667           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6668                                       TREE_TYPE (vectype), vec);
6669           scalar = gimple_convert (&seq, scalar_type, scalar);
6670           scalar_results.safe_push (scalar);
6671         }
6672       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6673     }
6674   else
6675     {
6676       bool reduce_with_shift;
6677       tree vec_temp;
6678
6679       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6680
6681       /* See if the target wants to do the final (shift) reduction
6682          in a vector mode of smaller size and first reduce upper/lower
6683          halves against each other.  */
6684       enum machine_mode mode1 = mode;
6685       tree stype = TREE_TYPE (vectype);
6686       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6687       unsigned nunits1 = nunits;
6688       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6689           && reduc_inputs.length () == 1)
6690         {
6691           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6692           /* For SLP reductions we have to make sure lanes match up, but
6693              since we're doing individual element final reduction reducing
6694              vector width here is even more important.
6695              ???  We can also separate lanes with permutes, for the common
6696              case of power-of-two group-size odd/even extracts would work.  */
6697           if (slp_reduc && nunits != nunits1)
6698             {
6699               nunits1 = least_common_multiple (nunits1, group_size);
6700               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6701             }
6702         }
6703       if (!slp_reduc
6704           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6705         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6706
6707       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6708                                                            stype, nunits1);
6709       reduce_with_shift = have_whole_vector_shift (mode1);
6710       if (!VECTOR_MODE_P (mode1)
6711           || !directly_supported_p (code, vectype1))
6712         reduce_with_shift = false;
6713
6714       /* First reduce the vector to the desired vector size we should
6715          do shift reduction on by combining upper and lower halves.  */
6716       gimple_seq stmts = NULL;
6717       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6718                                              code, &stmts);
6719       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6720       reduc_inputs[0] = new_temp;
6721
6722       if (reduce_with_shift && !slp_reduc)
6723         {
6724           int element_bitsize = tree_to_uhwi (bitsize);
6725           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6726              for variable-length vectors and also requires direct target support
6727              for loop reductions.  */
6728           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6729           int nelements = vec_size_in_bits / element_bitsize;
6730           vec_perm_builder sel;
6731           vec_perm_indices indices;
6732
6733           int elt_offset;
6734
6735           tree zero_vec = build_zero_cst (vectype1);
6736           /* Case 2: Create:
6737              for (offset = nelements/2; offset >= 1; offset/=2)
6738                 {
6739                   Create:  va' = vec_shift <va, offset>
6740                   Create:  va = vop <va, va'>
6741                 }  */
6742
6743           tree rhs;
6744
6745           if (dump_enabled_p ())
6746             dump_printf_loc (MSG_NOTE, vect_location,
6747                              "Reduce using vector shifts\n");
6748
6749           gimple_seq stmts = NULL;
6750           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6751           for (elt_offset = nelements / 2;
6752                elt_offset >= 1;
6753                elt_offset /= 2)
6754             {
6755               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6756               indices.new_vector (sel, 2, nelements);
6757               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6758               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6759                                        new_temp, zero_vec, mask);
6760               new_temp = gimple_build (&stmts, code,
6761                                        vectype1, new_name, new_temp);
6762             }
6763           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6764
6765           /* 2.4  Extract the final scalar result.  Create:
6766              s_out3 = extract_field <v_out2, bitpos>  */
6767
6768           if (dump_enabled_p ())
6769             dump_printf_loc (MSG_NOTE, vect_location,
6770                              "extract scalar result\n");
6771
6772           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6773                         bitsize, bitsize_zero_node);
6774           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6775           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6776           gimple_assign_set_lhs (epilog_stmt, new_temp);
6777           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6778           scalar_results.safe_push (new_temp);
6779         }
6780       else
6781         {
6782           /* Case 3: Create:
6783              s = extract_field <v_out2, 0>
6784              for (offset = element_size;
6785                   offset < vector_size;
6786                   offset += element_size;)
6787                {
6788                  Create:  s' = extract_field <v_out2, offset>
6789                  Create:  s = op <s, s'>  // For non SLP cases
6790                }  */
6791
6792           if (dump_enabled_p ())
6793             dump_printf_loc (MSG_NOTE, vect_location,
6794                              "Reduce using scalar code.\n");
6795
6796           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6797           int element_bitsize = tree_to_uhwi (bitsize);
6798           tree compute_type = TREE_TYPE (vectype);
6799           gimple_seq stmts = NULL;
6800           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6801             {
6802               int bit_offset;
6803               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6804                                        vec_temp, bitsize, bitsize_zero_node);
6805
6806               /* In SLP we don't need to apply reduction operation, so we just
6807                  collect s' values in SCALAR_RESULTS.  */
6808               if (slp_reduc)
6809                 scalar_results.safe_push (new_temp);
6810
6811               for (bit_offset = element_bitsize;
6812                    bit_offset < vec_size_in_bits;
6813                    bit_offset += element_bitsize)
6814                 {
6815                   tree bitpos = bitsize_int (bit_offset);
6816                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6817                                            compute_type, vec_temp,
6818                                            bitsize, bitpos);
6819                   if (slp_reduc)
6820                     {
6821                       /* In SLP we don't need to apply reduction operation, so
6822                          we just collect s' values in SCALAR_RESULTS.  */
6823                       new_temp = new_name;
6824                       scalar_results.safe_push (new_name);
6825                     }
6826                   else
6827                     new_temp = gimple_build (&stmts, code, compute_type,
6828                                              new_name, new_temp);
6829                 }
6830             }
6831
6832           /* The only case where we need to reduce scalar results in SLP, is
6833              unrolling.  If the size of SCALAR_RESULTS is greater than
6834              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6835              REDUC_GROUP_SIZE.  */
6836           if (slp_reduc)
6837             {
6838               tree res, first_res, new_res;
6839
6840               /* Reduce multiple scalar results in case of SLP unrolling.  */
6841               for (j = group_size; scalar_results.iterate (j, &res);
6842                    j++)
6843                 {
6844                   first_res = scalar_results[j % group_size];
6845                   new_res = gimple_build (&stmts, code, compute_type,
6846                                           first_res, res);
6847                   scalar_results[j % group_size] = new_res;
6848                 }
6849               scalar_results.truncate (group_size);
6850               for (k = 0; k < group_size; k++)
6851                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6852                                                     scalar_results[k]);
6853             }
6854           else
6855             {
6856               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6857               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6858               scalar_results.safe_push (new_temp);
6859             }
6860
6861           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6862         }
6863
6864       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6865           && induc_val)
6866         {
6867           /* Earlier we set the initial value to be a vector of induc_val
6868              values.  Check the result and if it is induc_val then replace
6869              with the original initial value, unless induc_val is
6870              the same as initial_def already.  */
6871           tree zcompare = make_ssa_name (boolean_type_node);
6872           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6873                                              induc_val);
6874           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6875           tree initial_def = reduc_info->reduc_initial_values[0];
6876           tree tmp = make_ssa_name (new_scalar_dest);
6877           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6878                                              initial_def, new_temp);
6879           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6880           scalar_results[0] = tmp;
6881         }
6882     }
6883
6884   /* 2.5 Adjust the final result by the initial value of the reduction
6885          variable. (When such adjustment is not needed, then
6886          'adjustment_def' is zero).  For example, if code is PLUS we create:
6887          new_temp = loop_exit_def + adjustment_def  */
6888
6889   if (adjustment_def)
6890     {
6891       gcc_assert (!slp_reduc);
6892       gimple_seq stmts = NULL;
6893       if (double_reduc)
6894         {
6895           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6896           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6897           new_temp = gimple_build (&stmts, code, vectype,
6898                                    reduc_inputs[0], adjustment_def);
6899         }
6900       else
6901         {
6902           new_temp = scalar_results[0];
6903           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6904           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6905                                            adjustment_def);
6906           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6907           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6908                                    new_temp, adjustment_def);
6909           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6910         }
6911
6912       epilog_stmt = gimple_seq_last_stmt (stmts);
6913       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6914       scalar_results[0] = new_temp;
6915     }
6916
6917   /* Record this operation if it could be reused by the epilogue loop.  */
6918   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6919       && reduc_inputs.length () == 1)
6920     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6921                                            { orig_reduc_input, reduc_info });
6922
6923   if (double_reduc)
6924     loop = outer_loop;
6925
6926   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6927           phis with new adjusted scalar results, i.e., replace use <s_out0>
6928           with use <s_out4>.
6929
6930      Transform:
6931         loop_exit:
6932           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6933           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6934           v_out2 = reduce <v_out1>
6935           s_out3 = extract_field <v_out2, 0>
6936           s_out4 = adjust_result <s_out3>
6937           use <s_out0>
6938           use <s_out0>
6939
6940      into:
6941
6942         loop_exit:
6943           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6944           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6945           v_out2 = reduce <v_out1>
6946           s_out3 = extract_field <v_out2, 0>
6947           s_out4 = adjust_result <s_out3>
6948           use <s_out4>
6949           use <s_out4> */
6950
6951   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6952   auto_vec<gimple *> phis;
6953   for (k = 0; k < live_out_stmts.size (); k++)
6954     {
6955       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6956       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6957
6958       /* Find the loop-closed-use at the loop exit of the original scalar
6959          result.  (The reduction result is expected to have two immediate uses,
6960          one at the latch block, and one at the loop exit).  For double
6961          reductions we are looking for exit phis of the outer loop.  */
6962       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6963         {
6964           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6965             {
6966               if (!is_gimple_debug (USE_STMT (use_p)))
6967                 phis.safe_push (USE_STMT (use_p));
6968             }
6969           else
6970             {
6971               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6972                 {
6973                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6974
6975                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6976                     {
6977                       if (!flow_bb_inside_loop_p (loop,
6978                                              gimple_bb (USE_STMT (phi_use_p)))
6979                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6980                         phis.safe_push (USE_STMT (phi_use_p));
6981                     }
6982                 }
6983             }
6984         }
6985
6986       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6987         {
6988           /* Replace the uses:  */
6989           orig_name = PHI_RESULT (exit_phi);
6990
6991           /* Look for a single use at the target of the skip edge.  */
6992           if (unify_with_main_loop_p)
6993             {
6994               use_operand_p use_p;
6995               gimple *user;
6996               if (!single_imm_use (orig_name, &use_p, &user))
6997                 gcc_unreachable ();
6998               orig_name = gimple_get_lhs (user);
6999             }
7000
7001           scalar_result = scalar_results[k];
7002           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
7003             {
7004               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7005                 SET_USE (use_p, scalar_result);
7006               update_stmt (use_stmt);
7007             }
7008         }
7009
7010       phis.truncate (0);
7011     }
7012 }
7013
7014 /* Return a vector of type VECTYPE that is equal to the vector select
7015    operation "MASK ? VEC : IDENTITY".  Insert the select statements
7016    before GSI.  */
7017
7018 static tree
7019 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
7020                      tree vec, tree identity)
7021 {
7022   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7023   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7024                                           mask, vec, identity);
7025   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7026   return cond;
7027 }
7028
7029 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7030    order, starting with LHS.  Insert the extraction statements before GSI and
7031    associate the new scalar SSA names with variable SCALAR_DEST.
7032    If MASK is nonzero mask the input and then operate on it unconditionally.
7033    Return the SSA name for the result.  */
7034
7035 static tree
7036 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7037                        tree_code code, tree lhs, tree vector_rhs,
7038                        tree mask)
7039 {
7040   tree vectype = TREE_TYPE (vector_rhs);
7041   tree scalar_type = TREE_TYPE (vectype);
7042   tree bitsize = TYPE_SIZE (scalar_type);
7043   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7044   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7045
7046   /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7047      to perform an unconditional element-wise reduction of it.  */
7048   if (mask)
7049     {
7050       tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7051                                                    "masked_vector_rhs");
7052       tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7053                                                   false);
7054       tree vector_identity = build_vector_from_val (vectype, neutral_op);
7055       gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7056                                              mask, vector_rhs, vector_identity);
7057       gsi_insert_before (gsi, select, GSI_SAME_STMT);
7058       vector_rhs = masked_vector_rhs;
7059     }
7060
7061   for (unsigned HOST_WIDE_INT bit_offset = 0;
7062        bit_offset < vec_size_in_bits;
7063        bit_offset += element_bitsize)
7064     {
7065       tree bitpos = bitsize_int (bit_offset);
7066       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7067                          bitsize, bitpos);
7068
7069       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7070       rhs = make_ssa_name (scalar_dest, stmt);
7071       gimple_assign_set_lhs (stmt, rhs);
7072       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7073
7074       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7075       tree new_name = make_ssa_name (scalar_dest, stmt);
7076       gimple_assign_set_lhs (stmt, new_name);
7077       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7078       lhs = new_name;
7079     }
7080   return lhs;
7081 }
7082
7083 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
7084    type of the vector input.  */
7085
7086 static internal_fn
7087 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7088 {
7089   internal_fn mask_reduc_fn;
7090   internal_fn mask_len_reduc_fn;
7091
7092   switch (reduc_fn)
7093     {
7094     case IFN_FOLD_LEFT_PLUS:
7095       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7096       mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7097       break;
7098
7099     default:
7100       return IFN_LAST;
7101     }
7102
7103   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7104                                       OPTIMIZE_FOR_SPEED))
7105     return mask_reduc_fn;
7106   if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7107                                       OPTIMIZE_FOR_SPEED))
7108     return mask_len_reduc_fn;
7109   return IFN_LAST;
7110 }
7111
7112 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
7113    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
7114    statement.  CODE is the operation performed by STMT_INFO and OPS are
7115    its scalar operands.  REDUC_INDEX is the index of the operand in
7116    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
7117    implements in-order reduction, or IFN_LAST if we should open-code it.
7118    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
7119    that should be used to control the operation in a fully-masked loop.  */
7120
7121 static bool
7122 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7123                                stmt_vec_info stmt_info,
7124                                gimple_stmt_iterator *gsi,
7125                                gimple **vec_stmt, slp_tree slp_node,
7126                                gimple *reduc_def_stmt,
7127                                code_helper code, internal_fn reduc_fn,
7128                                tree *ops, int num_ops, tree vectype_in,
7129                                int reduc_index, vec_loop_masks *masks,
7130                                vec_loop_lens *lens)
7131 {
7132   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7133   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7134   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7135
7136   int ncopies;
7137   if (slp_node)
7138     ncopies = 1;
7139   else
7140     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7141
7142   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7143   gcc_assert (ncopies == 1);
7144
7145   bool is_cond_op = false;
7146   if (!code.is_tree_code ())
7147     {
7148       code = conditional_internal_fn_code (internal_fn (code));
7149       gcc_assert (code != ERROR_MARK);
7150       is_cond_op = true;
7151     }
7152
7153   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7154
7155   if (slp_node)
7156     {
7157       if (is_cond_op)
7158         {
7159           if (dump_enabled_p ())
7160             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7161                              "fold-left reduction on SLP not supported.\n");
7162           return false;
7163         }
7164
7165       gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7166                             TYPE_VECTOR_SUBPARTS (vectype_in)));
7167     }
7168
7169   /* The operands either come from a binary operation or an IFN_COND operation.
7170      The former is a gimple assign with binary rhs and the latter is a
7171      gimple call with four arguments.  */
7172   gcc_assert (num_ops == 2 || num_ops == 4);
7173   tree op0, opmask;
7174   if (!is_cond_op)
7175     op0 = ops[1 - reduc_index];
7176   else
7177     {
7178       op0 = ops[2 + (1 - reduc_index)];
7179       opmask = ops[0];
7180       gcc_assert (!slp_node);
7181     }
7182
7183   int group_size = 1;
7184   stmt_vec_info scalar_dest_def_info;
7185   auto_vec<tree> vec_oprnds0, vec_opmask;
7186   if (slp_node)
7187     {
7188       auto_vec<vec<tree> > vec_defs (2);
7189       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7190       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7191       vec_defs[0].release ();
7192       vec_defs[1].release ();
7193       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7194       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7195     }
7196   else
7197     {
7198       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7199                                      op0, &vec_oprnds0);
7200       scalar_dest_def_info = stmt_info;
7201
7202       /* For an IFN_COND_OP we also need the vector mask operand.  */
7203       if (is_cond_op)
7204           vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7205                                          opmask, &vec_opmask);
7206     }
7207
7208   gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7209   tree scalar_dest = gimple_get_lhs (sdef);
7210   tree scalar_type = TREE_TYPE (scalar_dest);
7211   tree reduc_var = gimple_phi_result (reduc_def_stmt);
7212
7213   int vec_num = vec_oprnds0.length ();
7214   gcc_assert (vec_num == 1 || slp_node);
7215   tree vec_elem_type = TREE_TYPE (vectype_out);
7216   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7217
7218   tree vector_identity = NULL_TREE;
7219   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7220     {
7221       vector_identity = build_zero_cst (vectype_out);
7222       if (!HONOR_SIGNED_ZEROS (vectype_out))
7223         ;
7224       else
7225         {
7226           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7227           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7228                                         vector_identity);
7229         }
7230     }
7231
7232   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7233   int i;
7234   tree def0;
7235   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7236     {
7237       gimple *new_stmt;
7238       tree mask = NULL_TREE;
7239       tree len = NULL_TREE;
7240       tree bias = NULL_TREE;
7241       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7242         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7243       else if (is_cond_op)
7244         mask = vec_opmask[0];
7245       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7246         {
7247           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7248                                    i, 1);
7249           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7250           bias = build_int_cst (intQI_type_node, biasval);
7251           if (!is_cond_op)
7252             mask = build_minus_one_cst (truth_type_for (vectype_in));
7253         }
7254
7255       /* Handle MINUS by adding the negative.  */
7256       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7257         {
7258           tree negated = make_ssa_name (vectype_out);
7259           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7260           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7261           def0 = negated;
7262         }
7263
7264       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7265           && mask && mask_reduc_fn == IFN_LAST)
7266         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7267                                     vector_identity);
7268
7269       /* On the first iteration the input is simply the scalar phi
7270          result, and for subsequent iterations it is the output of
7271          the preceding operation.  */
7272       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7273         {
7274           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7275             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7276                                                    def0, mask, len, bias);
7277           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7278             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7279                                                    def0, mask);
7280           else
7281             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7282                                                    def0);
7283           /* For chained SLP reductions the output of the previous reduction
7284              operation serves as the input of the next. For the final statement
7285              the output cannot be a temporary - we reuse the original
7286              scalar destination of the last statement.  */
7287           if (i != vec_num - 1)
7288             {
7289               gimple_set_lhs (new_stmt, scalar_dest_var);
7290               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7291               gimple_set_lhs (new_stmt, reduc_var);
7292             }
7293         }
7294       else
7295         {
7296           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7297                                              tree_code (code), reduc_var, def0,
7298                                              mask);
7299           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7300           /* Remove the statement, so that we can use the same code paths
7301              as for statements that we've just created.  */
7302           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7303           gsi_remove (&tmp_gsi, true);
7304         }
7305
7306       if (i == vec_num - 1)
7307         {
7308           gimple_set_lhs (new_stmt, scalar_dest);
7309           vect_finish_replace_stmt (loop_vinfo,
7310                                     scalar_dest_def_info,
7311                                     new_stmt);
7312         }
7313       else
7314         vect_finish_stmt_generation (loop_vinfo,
7315                                      scalar_dest_def_info,
7316                                      new_stmt, gsi);
7317
7318       if (slp_node)
7319         slp_node->push_vec_def (new_stmt);
7320       else
7321         {
7322           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7323           *vec_stmt = new_stmt;
7324         }
7325     }
7326
7327   return true;
7328 }
7329
7330 /* Function is_nonwrapping_integer_induction.
7331
7332    Return true if the induction PHI STMT_VINFO of loop LOOP has a constant
7333    integer base and step and its value is known not to wrap.  */
7334
7335 static bool
7336 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7337 {
7338   gphi *iv_phi = as_a <gphi *> (stmt_vinfo->stmt);
7339   tree iv_type = TREE_TYPE (gimple_phi_result (iv_phi));
7340   tree iv_base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7341   tree iv_step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7342
7343   /* Only compile-time constant integer evolutions can be bounded here.  */
7344   bool integer_based = (TREE_CODE (iv_base) == INTEGER_CST
7345                         && TREE_CODE (iv_step) == INTEGER_CST);
7346   if (!integer_based)
7347     return false;
7348
7349   /* With undefined overflow the induction is taken not to wrap.  */
7350   if (TYPE_OVERFLOW_UNDEFINED (iv_type))
7351     return true;
7352
7353   /* Otherwise require that base + step * niter neither overflows the
7354      widest_int arithmetic nor needs more bits than IV_TYPE provides;
7355      NITER is whatever bound max_stmt_executions reports for LOOP.  */
7356   widest_int niter;
7357   if (!max_stmt_executions (loop, &niter))
7358     return false;
7359
7360   const auto sgn = TYPE_SIGN (iv_type);
7361   wi::overflow_type ovf = wi::OVF_NONE;
7362   widest_int growth = wi::mul (wi::to_widest (iv_step), niter, sgn, &ovf);
7363   if (ovf)
7364     return false;
7365
7366   widest_int last_value = wi::add (wi::to_widest (iv_base), growth, sgn, &ovf);
7367   if (ovf)
7368     return false;
7369   return wi::min_precision (last_value, sgn) <= TYPE_PRECISION (iv_type);
7370 }
7371
7372 /* Return true if masking an operation with code CODE can be supported by
7373    inserting a conditional expression.  COND_FN is the conditional internal
7374    function for the operation, or IFN_LAST if there is none.  VECTYPE_IN is
7375    the type of the vector input.  */
7376 static bool
7377 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7378                          tree vectype_in)
7379 {
7380   /* Answer no when COND_FN is directly supported for VECTYPE_IN.  */
7381   const bool have_cond_fn
7382     = (cond_fn != IFN_LAST
7383        && direct_internal_fn_supported_p (cond_fn, vectype_in,
7384                                           OPTIMIZE_FOR_SPEED));
7385   if (have_cond_fn)
7386     return false;
7387
7388   /* Of the remaining operations only these two tree codes qualify;
7389      anything that is not a tree code never does.  */
7390   if (!code.is_tree_code ())
7391     return false;
7392
7393   const enum tree_code rhs_code = tree_code (code);
7394   return rhs_code == DOT_PROD_EXPR || rhs_code == SAD_EXPR;
7395 }
7396
7397 /* Enable masked vectorization of the operation with code CODE by replacing
7398    VOP[1] with a VEC_COND_EXPR that keeps it in the lanes selected by MASK,
7399    the loop mask, and substitutes a replacement value elsewhere.  VOP is the
7400    array of operands; VOP[1] is updated in place.  GSI is the statement
7401    iterator before which the new conditional expression is inserted.  */
7402 static void
7403 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7404                       gimple_stmt_iterator *gsi)
7405 {
7406   /* Value given to the lanes of VOP[1] that MASK does not select.  */
7407   tree inactive_val = NULL_TREE;
7408
7409   switch (tree_code (code))
7410     {
7411     case DOT_PROD_EXPR:
7412       /* Zero, presumably so that the products in those lanes vanish.  */
7413       inactive_val = build_zero_cst (TREE_TYPE (vop[1]));
7414       break;
7415
7416     case SAD_EXPR:
7417       /* A copy of VOP[0], presumably so that the differences in those
7418          lanes vanish.  */
7419       inactive_val = vop[0];
7420       break;
7421
7422     default:
7423       /* No other operation code is handled here.  */
7424       gcc_unreachable ();
7425     }
7426
7427   tree op1_type = TREE_TYPE (vop[1]);
7428   tree masked_op1 = make_temp_ssa_name (op1_type, NULL, "masked_op1");
7429   gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, mask,
7430                                          vop[1], inactive_val);
7431   gsi_insert_before (gsi, select, GSI_SAME_STMT);
7432   vop[1] = masked_op1;
7433 }
7434
7435 /* Function vectorizable_reduction.
7436
7437    Check if STMT_INFO performs a reduction operation that can be vectorized.
7438    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7439    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7440    Return true if STMT_INFO is vectorizable in this way.
7441
7442    This function also handles reduction idioms (patterns) that have been
7443    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7444    may be of this form:
7445      X = pattern_expr (arg0, arg1, ..., X)
7446    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7447    sequence that had been detected and replaced by the pattern-stmt
7448    (STMT_INFO).
7449
7450    This function also handles reduction of condition expressions, for example:
7451      for (int i = 0; i < N; i++)
7452        if (a[i] < value)
7453          last = a[i];
7454    This is handled by vectorising the loop and creating an additional vector
7455    containing the loop indexes for which "a[i] < value" was true.  In the
7456    function epilogue this is reduced to a single max value and then used to
7457    index into the vector of results.
7458
7459    In some cases of reduction patterns, the type of the reduction variable X is
7460    different than the type of the other arguments of STMT_INFO.
7461    In such cases, the vectype that is used when transforming STMT_INFO into
7462    a vector stmt is different than the vectype that is used to determine the
7463    vectorization factor, because it consists of a different number of elements
7464    than the actual number of elements that are being operated upon in parallel.
7465
7466    For example, consider an accumulation of shorts into an int accumulator.
7467    On some targets it's possible to vectorize this pattern operating on 8
7468    shorts at a time (hence, the vectype for purposes of determining the
7469    vectorization factor should be V8HI); on the other hand, the vectype that
7470    is used to create the vector form is actually V4SI (the type of the result).
7471
7472    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7473    indicates what is the actual level of parallelism (V8HI in the example), so
7474    that the right vectorization factor would be derived.  This vectype
7475    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7476    be used to create the vectorized stmt.  The right vectype for the vectorized
7477    stmt is obtained from the type of the result X:
7478       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7479
7480    This means that, contrary to "regular" reductions (or "regular" stmts in
7481    general), the following equation:
7482       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7483    does *NOT* necessarily hold for reduction patterns.  */
7484
7485 bool
7486 vectorizable_reduction (loop_vec_info loop_vinfo,
7487                         stmt_vec_info stmt_info, slp_tree slp_node,
7488                         slp_instance slp_node_instance,
7489                         stmt_vector_for_cost *cost_vec)
7490 {
7491   tree vectype_in = NULL_TREE;
7492   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7493   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7494   stmt_vec_info cond_stmt_vinfo = NULL;
7495   int i;
7496   int ncopies;
7497   bool single_defuse_cycle = false;
7498   bool nested_cycle = false;
7499   bool double_reduc = false;
7500   int vec_num;
7501   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7502   tree cond_reduc_val = NULL_TREE;
7503
7504   /* Make sure it was already recognized as a reduction computation.  */
7505   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7506       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7507       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7508     return false;
7509
7510   /* The stmt we store reduction analysis meta on.  */
7511   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7512   reduc_info->is_reduc_info = true;
7513
7514   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7515     {
7516       if (is_a <gphi *> (stmt_info->stmt))
7517         {
7518           if (slp_node)
7519             {
7520               /* We eventually need to set a vector type on invariant
7521                  arguments.  */
7522               unsigned j;
7523               slp_tree child;
7524               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7525                 if (!vect_maybe_update_slp_op_vectype
7526                        (child, SLP_TREE_VECTYPE (slp_node)))
7527                   {
7528                     if (dump_enabled_p ())
7529                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7530                                        "incompatible vector types for "
7531                                        "invariants\n");
7532                     return false;
7533                   }
7534             }
7535           /* Analysis for double-reduction is done on the outer
7536              loop PHI, nested cycles have no further restrictions.  */
7537           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7538         }
7539       else
7540         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7541       return true;
7542     }
7543
7544   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7545   stmt_vec_info phi_info = stmt_info;
7546   if (!is_a <gphi *> (stmt_info->stmt))
7547     {
7548       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7549       return true;
7550     }
7551   if (slp_node)
7552     {
7553       slp_node_instance->reduc_phis = slp_node;
7554       /* ???  We're leaving slp_node to point to the PHIs, we only
7555          need it to get at the number of vector stmts which wasn't
7556          yet initialized for the instance root.  */
7557     }
7558   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7559     {
7560       use_operand_p use_p;
7561       gimple *use_stmt;
7562       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7563                                  &use_p, &use_stmt);
7564       gcc_assert (res);
7565       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7566     }
7567
7568   /* PHIs should not participate in patterns.  */
7569   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7570   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7571
7572   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7573      and compute the reduction chain length.  Discover the real
7574      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7575   tree reduc_def
7576     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7577                              loop_latch_edge
7578                                (gimple_bb (reduc_def_phi)->loop_father));
7579   unsigned reduc_chain_length = 0;
7580   bool only_slp_reduc_chain = true;
7581   stmt_info = NULL;
7582   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7583   while (reduc_def != PHI_RESULT (reduc_def_phi))
7584     {
7585       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7586       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7587       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7588         {
7589           if (dump_enabled_p ())
7590             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7591                              "reduction chain broken by patterns.\n");
7592           return false;
7593         }
7594       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7595         only_slp_reduc_chain = false;
7596       /* For epilogue generation live members of the chain need
7597          to point back to the PHI via their original stmt for
7598          info_for_reduction to work.  For SLP we need to look at
7599          all lanes here - even though we only will vectorize from
7600          the SLP node with live lane zero the other live lanes also
7601          need to be identified as part of a reduction to be able
7602          to skip code generation for them.  */
7603       if (slp_for_stmt_info)
7604         {
7605           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7606             if (STMT_VINFO_LIVE_P (s))
7607               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7608         }
7609       else if (STMT_VINFO_LIVE_P (vdef))
7610         STMT_VINFO_REDUC_DEF (def) = phi_info;
7611       gimple_match_op op;
7612       if (!gimple_extract_op (vdef->stmt, &op))
7613         {
7614           if (dump_enabled_p ())
7615             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7616                              "reduction chain includes unsupported"
7617                              " statement type.\n");
7618           return false;
7619         }
7620       if (CONVERT_EXPR_CODE_P (op.code))
7621         {
7622           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7623             {
7624               if (dump_enabled_p ())
7625                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7626                                  "conversion in the reduction chain.\n");
7627               return false;
7628             }
7629         }
7630       else if (!stmt_info)
7631         /* First non-conversion stmt.  */
7632         stmt_info = vdef;
7633       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7634       reduc_chain_length++;
7635       if (!stmt_info && slp_node)
7636         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7637     }
7638   /* PHIs should not participate in patterns.  */
7639   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7640
7641   if (nested_in_vect_loop_p (loop, stmt_info))
7642     {
7643       loop = loop->inner;
7644       nested_cycle = true;
7645     }
7646
7647   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7648      element.  */
7649   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7650     {
7651       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7652       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7653     }
7654   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7655     gcc_assert (slp_node
7656                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7657
7658   /* 1. Is vectorizable reduction?  */
7659   /* Not supportable if the reduction variable is used in the loop, unless
7660      it's a reduction chain.  */
7661   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7662       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7663     return false;
7664
7665   /* Reductions that are not used even in an enclosing outer-loop,
7666      are expected to be "live" (used out of the loop).  */
7667   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7668       && !STMT_VINFO_LIVE_P (stmt_info))
7669     return false;
7670
7671   /* 2. Has this been recognized as a reduction pattern?
7672
7673      Check if STMT represents a pattern that has been recognized
7674      in earlier analysis stages.  For stmts that represent a pattern,
7675      the STMT_VINFO_RELATED_STMT field records the last stmt in
7676      the original sequence that constitutes the pattern.  */
7677
7678   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7679   if (orig_stmt_info)
7680     {
7681       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7682       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7683     }
7684
7685   /* 3. Check the operands of the operation.  The first operands are defined
7686         inside the loop body. The last operand is the reduction variable,
7687         which is defined by the loop-header-phi.  */
7688
7689   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7690   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7691   gimple_match_op op;
7692   if (!gimple_extract_op (stmt_info->stmt, &op))
7693     gcc_unreachable ();
7694   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7695                             || op.code == WIDEN_SUM_EXPR
7696                             || op.code == SAD_EXPR);
7697
7698   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7699       && !SCALAR_FLOAT_TYPE_P (op.type))
7700     return false;
7701
7702   /* Do not try to vectorize bit-precision reductions.  */
7703   if (!type_has_mode_precision_p (op.type))
7704     return false;
7705
7706   /* For lane-reducing ops we're reducing the number of reduction PHIs
7707      which means the only use of that may be in the lane-reducing operation.  */
7708   if (lane_reduc_code_p
7709       && reduc_chain_length != 1
7710       && !only_slp_reduc_chain)
7711     {
7712       if (dump_enabled_p ())
7713         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7714                          "lane-reducing reduction with extra stmts.\n");
7715       return false;
7716     }
7717
7718   /* All uses but the last are expected to be defined in the loop.
7719      The last use is the reduction variable.  In case of nested cycle this
7720      assumption is not true: we use reduc_index to record the index of the
7721      reduction variable.  */
7722   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7723   tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7724   /* We need to skip an extra operand for COND_EXPRs with embedded
7725      comparison.  */
7726   unsigned opno_adjust = 0;
7727   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7728     opno_adjust = 1;
7729   for (i = 0; i < (int) op.num_ops; i++)
7730     {
7731       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7732       if (i == 0 && op.code == COND_EXPR)
7733         continue;
7734
7735       stmt_vec_info def_stmt_info;
7736       enum vect_def_type dt;
7737       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7738                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7739                                &vectype_op[i], &def_stmt_info))
7740         {
7741           if (dump_enabled_p ())
7742             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7743                              "use not simple.\n");
7744           return false;
7745         }
7746       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7747         continue;
7748
7749       /* For an IFN_COND_OP we might hit the reduction definition operand
7750          twice (once as definition, once as else).  */
7751       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7752         continue;
7753
7754       /* There should be only one cycle def in the stmt, the one
7755          leading to reduc_def.  */
7756       if (VECTORIZABLE_CYCLE_DEF (dt))
7757         return false;
7758
7759       if (!vectype_op[i])
7760         vectype_op[i]
7761           = get_vectype_for_scalar_type (loop_vinfo,
7762                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7763
7764       /* To properly compute ncopies we are interested in the widest
7765          non-reduction input type in case we're looking at a widening
7766          accumulation that we later handle in vect_transform_reduction.  */
7767       if (lane_reduc_code_p
7768           && vectype_op[i]
7769           && (!vectype_in
7770               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7771                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7772         vectype_in = vectype_op[i];
7773
7774       /* Record how the non-reduction-def value of COND_EXPR is defined.
7775          ???  For a chain of multiple CONDs we'd have to match them up all.  */
7776       if (op.code == COND_EXPR && reduc_chain_length == 1)
7777         {
7778           if (dt == vect_constant_def)
7779             {
7780               cond_reduc_dt = dt;
7781               cond_reduc_val = op.ops[i];
7782             }
7783           else if (dt == vect_induction_def
7784                    && def_stmt_info
7785                    && is_nonwrapping_integer_induction (def_stmt_info, loop))
7786             {
7787               cond_reduc_dt = dt;
7788               cond_stmt_vinfo = def_stmt_info;
7789             }
7790         }
7791     }
7792   if (!vectype_in)
7793     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7794   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7795
7796   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7797   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7798   /* If we have a condition reduction, see if we can simplify it further.  */
7799   if (v_reduc_type == COND_REDUCTION)
7800     {
7801       if (slp_node)
7802         return false;
7803
7804       /* When the condition uses the reduction value in the condition, fail.  */
7805       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7806         {
7807           if (dump_enabled_p ())
7808             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7809                              "condition depends on previous iteration\n");
7810           return false;
7811         }
7812
7813       if (reduc_chain_length == 1
7814           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7815                                               OPTIMIZE_FOR_SPEED)
7816               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7817                                                  vectype_in,
7818                                                  OPTIMIZE_FOR_SPEED)))
7819         {
7820           if (dump_enabled_p ())
7821             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7822                              "optimizing condition reduction with"
7823                              " FOLD_EXTRACT_LAST.\n");
7824           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7825         }
7826       else if (cond_reduc_dt == vect_induction_def)
7827         {
7828           tree base
7829             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7830           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7831
7832           gcc_assert (TREE_CODE (base) == INTEGER_CST
7833                       && TREE_CODE (step) == INTEGER_CST);
7834           cond_reduc_val = NULL_TREE;
7835           enum tree_code cond_reduc_op_code = ERROR_MARK;
7836           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7837           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7838             ;
7839           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7840              above base; punt if base is the minimum value of the type for
7841              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7842           else if (tree_int_cst_sgn (step) == -1)
7843             {
7844               cond_reduc_op_code = MIN_EXPR;
7845               if (tree_int_cst_sgn (base) == -1)
7846                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7847               else if (tree_int_cst_lt (base,
7848                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7849                 cond_reduc_val
7850                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7851             }
7852           else
7853             {
7854               cond_reduc_op_code = MAX_EXPR;
7855               if (tree_int_cst_sgn (base) == 1)
7856                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7857               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7858                                         base))
7859                 cond_reduc_val
7860                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7861             }
7862           if (cond_reduc_val)
7863             {
7864               if (dump_enabled_p ())
7865                 dump_printf_loc (MSG_NOTE, vect_location,
7866                                  "condition expression based on "
7867                                  "integer induction.\n");
7868               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7869               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7870                 = cond_reduc_val;
7871               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7872             }
7873         }
7874       else if (cond_reduc_dt == vect_constant_def)
7875         {
7876           enum vect_def_type cond_initial_dt;
7877           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7878           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7879           if (cond_initial_dt == vect_constant_def
7880               && types_compatible_p (TREE_TYPE (cond_initial_val),
7881                                      TREE_TYPE (cond_reduc_val)))
7882             {
7883               tree e = fold_binary (LE_EXPR, boolean_type_node,
7884                                     cond_initial_val, cond_reduc_val);
7885               if (e && (integer_onep (e) || integer_zerop (e)))
7886                 {
7887                   if (dump_enabled_p ())
7888                     dump_printf_loc (MSG_NOTE, vect_location,
7889                                      "condition expression based on "
7890                                      "compile time constant.\n");
7891                   /* Record reduction code at analysis stage.  */
7892                   STMT_VINFO_REDUC_CODE (reduc_info)
7893                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7894                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7895                 }
7896             }
7897         }
7898     }
7899
7900   if (STMT_VINFO_LIVE_P (phi_info))
7901     return false;
7902
7903   if (slp_node)
7904     ncopies = 1;
7905   else
7906     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7907
7908   gcc_assert (ncopies >= 1);
7909
7910   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7911
7912   if (nested_cycle)
7913     {
7914       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7915                   == vect_double_reduction_def);
7916       double_reduc = true;
7917     }
7918
7919   /* 4.2. Check support for the epilog operation.
7920
7921           If STMT represents a reduction pattern, then the type of the
7922           reduction variable may be different than the type of the rest
7923           of the arguments.  For example, consider the case of accumulation
7924           of shorts into an int accumulator; The original code:
7925                         S1: int_a = (int) short_a;
7926           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7927
7928           was replaced with:
7929                         STMT: int_acc = widen_sum <short_a, int_acc>
7930
7931           This means that:
7932           1. The tree-code that is used to create the vector operation in the
7933              epilog code (that reduces the partial results) is not the
7934              tree-code of STMT, but is rather the tree-code of the original
7935              stmt from the pattern that STMT is replacing.  I.e, in the example
7936              above we want to use 'widen_sum' in the loop, but 'plus' in the
7937              epilog.
7938           2. The type (mode) we use to check available target support
7939              for the vector operation to be created in the *epilog*, is
7940              determined by the type of the reduction variable (in the example
7941              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7942              However the type (mode) we use to check available target support
7943              for the vector operation to be created *inside the loop*, is
7944              determined by the type of the other arguments to STMT (in the
7945              example we'd check this: optab_handler (widen_sum_optab,
7946              vect_short_mode)).
7947
7948           This is contrary to "regular" reductions, in which the types of all
7949           the arguments are the same as the type of the reduction variable.
7950           For "regular" reductions we can therefore use the same vector type
7951           (and also the same tree-code) when generating the epilog code and
7952           when generating the code inside the loop.  */
7953
7954   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7955
7956   /* If-conversion might have created a conditional operation like
7957      IFN_COND_ADD already.  Use the internal code for the following checks.  */
7958   if (orig_code.is_internal_fn ())
7959     {
7960       tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7961       orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7962     }
7963
7964   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7965
7966   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7967   if (reduction_type == TREE_CODE_REDUCTION)
7968     {
7969       /* Check whether it's ok to change the order of the computation.
7970          Generally, when vectorizing a reduction we change the order of the
7971          computation.  This may change the behavior of the program in some
7972          cases, so we need to check that this is ok.  One exception is when
7973          vectorizing an outer-loop: the inner-loop is executed sequentially,
7974          and therefore vectorizing reductions in the inner-loop during
7975          outer-loop vectorization is safe.  Likewise when we are vectorizing
7976          a series of reductions using SLP and the VF is one the reductions
7977          are performed in scalar order.  */
7978       if (slp_node
7979           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7980           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7981         ;
7982       else if (needs_fold_left_reduction_p (op.type, orig_code))
7983         {
7984           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7985              is not directly used in stmt.  */
7986           if (!only_slp_reduc_chain
7987               && reduc_chain_length != 1)
7988             {
7989               if (dump_enabled_p ())
7990                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7991                                  "in-order reduction chain without SLP.\n");
7992               return false;
7993             }
7994           STMT_VINFO_REDUC_TYPE (reduc_info)
7995             = reduction_type = FOLD_LEFT_REDUCTION;
7996         }
7997       else if (!commutative_binary_op_p (orig_code, op.type)
7998                || !associative_binary_op_p (orig_code, op.type))
7999         {
8000           if (dump_enabled_p ())
8001             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8002                             "reduction: not commutative/associative\n");
8003           return false;
8004         }
8005     }
8006
8007   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
8008       && ncopies > 1)
8009     {
8010       if (dump_enabled_p ())
8011         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8012                          "multiple types in double reduction or condition "
8013                          "reduction or fold-left reduction.\n");
8014       return false;
8015     }
8016
8017   internal_fn reduc_fn = IFN_LAST;
8018   if (reduction_type == TREE_CODE_REDUCTION
8019       || reduction_type == FOLD_LEFT_REDUCTION
8020       || reduction_type == INTEGER_INDUC_COND_REDUCTION
8021       || reduction_type == CONST_COND_REDUCTION)
8022     {
8023       if (reduction_type == FOLD_LEFT_REDUCTION
8024           ? fold_left_reduction_fn (orig_code, &reduc_fn)
8025           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8026         {
8027           if (reduc_fn != IFN_LAST
8028               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8029                                                   OPTIMIZE_FOR_SPEED))
8030             {
8031               if (dump_enabled_p ())
8032                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8033                                  "reduc op not supported by target.\n");
8034
8035               reduc_fn = IFN_LAST;
8036             }
8037         }
8038       else
8039         {
8040           if (!nested_cycle || double_reduc)
8041             {
8042               if (dump_enabled_p ())
8043                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8044                                  "no reduc code for scalar code.\n");
8045
8046               return false;
8047             }
8048         }
8049     }
8050   else if (reduction_type == COND_REDUCTION)
8051     {
8052       int scalar_precision
8053         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8054       cr_index_scalar_type = make_unsigned_type (scalar_precision);
8055       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8056                                                 vectype_out);
8057
8058       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8059                                           OPTIMIZE_FOR_SPEED))
8060         reduc_fn = IFN_REDUC_MAX;
8061     }
8062   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8063
8064   if (reduction_type != EXTRACT_LAST_REDUCTION
8065       && (!nested_cycle || double_reduc)
8066       && reduc_fn == IFN_LAST
8067       && !nunits_out.is_constant ())
8068     {
8069       if (dump_enabled_p ())
8070         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8071                          "missing target support for reduction on"
8072                          " variable-length vectors.\n");
8073       return false;
8074     }
8075
8076   /* For SLP reductions, see if there is a neutral value we can use.  */
8077   tree neutral_op = NULL_TREE;
8078   if (slp_node)
8079     {
8080       tree initial_value = NULL_TREE;
8081       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8082         initial_value = vect_phi_initial_value (reduc_def_phi);
8083       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8084                                              orig_code, initial_value);
8085     }
8086
8087   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8088     {
8089       /* We can't support in-order reductions of code such as this:
8090
8091            for (int i = 0; i < n1; ++i)
8092              for (int j = 0; j < n2; ++j)
8093                l += a[j];
8094
8095          since GCC effectively transforms the loop when vectorizing:
8096
8097            for (int i = 0; i < n1 / VF; ++i)
8098              for (int j = 0; j < n2; ++j)
8099                for (int k = 0; k < VF; ++k)
8100                  l += a[j];
8101
8102          which is a reassociation of the original operation.  */
8103       if (dump_enabled_p ())
8104         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8105                          "in-order double reduction not supported.\n");
8106
8107       return false;
8108     }
8109
8110   if (reduction_type == FOLD_LEFT_REDUCTION
8111       && slp_node
8112       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8113     {
8114       /* We cannot use in-order reductions in this case because there is
8115          an implicit reassociation of the operations involved.  */
8116       if (dump_enabled_p ())
8117         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8118                          "in-order unchained SLP reductions not supported.\n");
8119       return false;
8120     }
8121
8122   /* For double reductions, and for SLP reductions with a neutral value,
8123      we construct a variable-length initial vector by loading a vector
8124      full of the neutral value and then shift-and-inserting the start
8125      values into the low-numbered elements.  */
8126   if ((double_reduc || neutral_op)
8127       && !nunits_out.is_constant ()
8128       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8129                                           vectype_out, OPTIMIZE_FOR_SPEED))
8130     {
8131       if (dump_enabled_p ())
8132         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8133                          "reduction on variable-length vectors requires"
8134                          " target support for a vector-shift-and-insert"
8135                          " operation.\n");
8136       return false;
8137     }
8138
8139   /* Check extra constraints for variable-length unchained SLP reductions.  */
8140   if (slp_node
8141       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8142       && !nunits_out.is_constant ())
8143     {
8144       /* We checked above that we could build the initial vector when
8145          there's a neutral element value.  Check here for the case in
8146          which each SLP statement has its own initial value and in which
8147          that value needs to be repeated for every instance of the
8148          statement within the initial vector.  */
8149       unsigned int group_size = SLP_TREE_LANES (slp_node);
8150       if (!neutral_op
8151           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8152                                               TREE_TYPE (vectype_out)))
8153         {
8154           if (dump_enabled_p ())
8155             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8156                              "unsupported form of SLP reduction for"
8157                              " variable-length vectors: cannot build"
8158                              " initial vector.\n");
8159           return false;
8160         }
8161       /* The epilogue code relies on the number of elements being a multiple
8162          of the group size.  The duplicate-and-interleave approach to setting
8163          up the initial vector does too.  */
8164       if (!multiple_p (nunits_out, group_size))
8165         {
8166           if (dump_enabled_p ())
8167             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8168                              "unsupported form of SLP reduction for"
8169                              " variable-length vectors: the vector size"
8170                              " is not a multiple of the number of results.\n");
8171           return false;
8172         }
8173     }
8174
8175   if (reduction_type == COND_REDUCTION)
8176     {
8177       widest_int ni;
8178
8179       if (! max_loop_iterations (loop, &ni))
8180         {
8181           if (dump_enabled_p ())
8182             dump_printf_loc (MSG_NOTE, vect_location,
8183                              "loop count not known, cannot create cond "
8184                              "reduction.\n");
8185           return false;
8186         }
8187       /* Convert backedges to iterations.  */
8188       ni += 1;
8189
8190       /* The additional index will be the same type as the condition.  Check
8191          that the loop can fit into this less one (because we'll use up the
8192          zero slot for when there are no matches).  */
8193       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8194       if (wi::geu_p (ni, wi::to_widest (max_index)))
8195         {
8196           if (dump_enabled_p ())
8197             dump_printf_loc (MSG_NOTE, vect_location,
8198                              "loop size is greater than data size.\n");
8199           return false;
8200         }
8201     }
8202
8203   /* In case the vectorization factor (VF) is bigger than the number
8204      of elements that we can fit in a vectype (nunits), we have to generate
8205      more than one vector stmt - i.e - we need to "unroll" the
8206      vector stmt by a factor VF/nunits.  For more details see documentation
8207      in vectorizable_operation.  */
8208
8209   /* If the reduction is used in an outer loop we need to generate
8210      VF intermediate results, like so (e.g. for ncopies=2):
8211         r0 = phi (init, r0)
8212         r1 = phi (init, r1)
8213         r0 = x0 + r0;
8214         r1 = x1 + r1;
8215     (i.e. we generate VF results in 2 registers).
8216     In this case we have a separate def-use cycle for each copy, and therefore
8217     for each copy we get the vector def for the reduction variable from the
8218     respective phi node created for this copy.
8219
8220     Otherwise (the reduction is unused in the loop nest), we can combine
8221     together intermediate results, like so (e.g. for ncopies=2):
8222         r = phi (init, r)
8223         r = x0 + r;
8224         r = x1 + r;
8225    (i.e. we generate VF/2 results in a single register).
8226    In this case for each copy we get the vector def for the reduction variable
8227    from the vectorized reduction operation generated in the previous iteration.
8228
8229    This only works when we see both the reduction PHI and its only consumer
8230    in vectorizable_reduction and there are no intermediate stmts
8231    participating.  When unrolling we want each unrolled iteration to have its
8232    own reduction accumulator since one of the main goals of unrolling a
8233    reduction is to reduce the aggregate loop-carried latency.  */
8234   if (ncopies > 1
8235       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8236       && reduc_chain_length == 1
8237       && loop_vinfo->suggested_unroll_factor == 1)
8238     single_defuse_cycle = true;
8239
8240   if (single_defuse_cycle || lane_reduc_code_p)
8241     {
8242       gcc_assert (op.code != COND_EXPR);
8243
8244       /* 4. Supportable by target?  */
8245       bool ok = true;
8246
8247       /* 4.1. check support for the operation in the loop
8248
8249          This isn't necessary for the lane reduction codes, since they
8250          can only be produced by pattern matching, and it's up to the
8251          pattern matcher to test for support.  The main reason for
8252          specifically skipping this step is to avoid rechecking whether
8253          mixed-sign dot-products can be implemented using signed
8254          dot-products.  */
8255       machine_mode vec_mode = TYPE_MODE (vectype_in);
8256       if (!lane_reduc_code_p
8257           && !directly_supported_p (op.code, vectype_in, optab_vector))
8258         {
8259           if (dump_enabled_p ())
8260             dump_printf (MSG_NOTE, "op not supported by target.\n");
8261           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8262               || !vect_can_vectorize_without_simd_p (op.code))
8263             ok = false;
8264           else
8265             if (dump_enabled_p ())
8266               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8267         }
8268
8269       if (vect_emulated_vector_p (vectype_in)
8270           && !vect_can_vectorize_without_simd_p (op.code))
8271         {
8272           if (dump_enabled_p ())
8273             dump_printf (MSG_NOTE, "using word mode not possible.\n");
8274           return false;
8275         }
8276
8277       /* lane-reducing operations have to go through vect_transform_reduction.
8278          For the other cases try without the single cycle optimization.  */
8279       if (!ok)
8280         {
8281           if (lane_reduc_code_p)
8282             return false;
8283           else
8284             single_defuse_cycle = false;
8285         }
8286     }
8287   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8288
8289   /* If the reduction stmt is one of the patterns that have lane
8290      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
8291   if ((ncopies > 1 && ! single_defuse_cycle)
8292       && lane_reduc_code_p)
8293     {
8294       if (dump_enabled_p ())
8295         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8296                          "multi def-use cycle not possible for lane-reducing "
8297                          "reduction operation\n");
8298       return false;
8299     }
8300
8301   if (slp_node
8302       && !(!single_defuse_cycle
8303            && !lane_reduc_code_p
8304            && reduction_type != FOLD_LEFT_REDUCTION))
8305     for (i = 0; i < (int) op.num_ops; i++)
8306       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8307         {
8308           if (dump_enabled_p ())
8309             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8310                              "incompatible vector types for invariants\n");
8311           return false;
8312         }
8313
8314   if (slp_node)
8315     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8316   else
8317     vec_num = 1;
8318
8319   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8320                              reduction_type, ncopies, cost_vec);
8321   /* Cost the reduction op inside the loop if transformed via
8322      vect_transform_reduction.  Otherwise this is costed by the
8323      separate vectorizable_* routines.  */
8324   if (single_defuse_cycle || lane_reduc_code_p)
8325     {
8326       int factor = 1;
8327       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8328         /* Three dot-products and a subtraction.  */
8329         factor = 4;
8330       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8331                         stmt_info, 0, vect_body);
8332     }
8333
8334   if (dump_enabled_p ()
8335       && reduction_type == FOLD_LEFT_REDUCTION)
8336     dump_printf_loc (MSG_NOTE, vect_location,
8337                      "using an in-order (fold-left) reduction.\n");
8338   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8339   /* All but single defuse-cycle optimized, lane-reducing and fold-left
8340      reductions go through their own vectorizable_* routines.  */
8341   if (!single_defuse_cycle
8342       && !lane_reduc_code_p
8343       && reduction_type != FOLD_LEFT_REDUCTION)
8344     {
8345       stmt_vec_info tem
8346         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8347       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8348         {
8349           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8350           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8351         }
8352       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8353       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8354     }
8355   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8356     {
8357       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8358       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8359       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8360
8361       if (reduction_type != FOLD_LEFT_REDUCTION
8362           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8363           && (cond_fn == IFN_LAST
8364               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8365                                                   OPTIMIZE_FOR_SPEED)))
8366         {
8367           if (dump_enabled_p ())
8368             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8369                              "can't operate on partial vectors because"
8370                              " no conditional operation is available.\n");
8371           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8372         }
8373       else if (reduction_type == FOLD_LEFT_REDUCTION
8374                && reduc_fn == IFN_LAST
8375                && !expand_vec_cond_expr_p (vectype_in,
8376                                            truth_type_for (vectype_in),
8377                                            SSA_NAME))
8378         {
8379           if (dump_enabled_p ())
8380             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8381                              "can't operate on partial vectors because"
8382                              " no conditional operation is available.\n");
8383           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8384         }
8385       else if (reduction_type == FOLD_LEFT_REDUCTION
8386                && internal_fn_mask_index (reduc_fn) == -1
8387                && FLOAT_TYPE_P (vectype_in)
8388                && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8389         {
8390           if (dump_enabled_p ())
8391             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8392                              "can't operate on partial vectors because"
8393                              " signed zeros cannot be preserved.\n");
8394           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8395         }
8396       else
8397         {
8398           internal_fn mask_reduc_fn
8399             = get_masked_reduction_fn (reduc_fn, vectype_in);
8400
8401           if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8402             vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8403                                   vectype_in, 1);
8404           else
8405             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8406                                    vectype_in, NULL);
8407         }
8408     }
8409   return true;
8410 }
8411
8412 /* STMT_INFO is a dot-product reduction whose multiplication operands
8413    have different signs.  Emit a sequence to emulate the operation
8414    using a series of signed DOT_PROD_EXPRs and return the last
8415    statement generated.  VEC_DEST is the result of the vector operation
8416    and VOP lists its inputs.

        Every statement except the returned one is emitted at GSI through
        vect_finish_stmt_generation.  The returned statement, which assigns
        the converted final dot-product to VEC_DEST, is only built here and
        is not inserted by this function.

        VOP is modified in place: VOP[0] and VOP[1] are swapped when VOP[0]
        is not unsigned, and each unsigned entry is replaced by an SSA name
        holding its conversion to the signed type.  */
8417
8418 static gassign *
8419 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8420                              gimple_stmt_iterator *gsi, tree vec_dest,
8421                              tree vop[3])
8422 {
8423   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8424   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8425   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8426   gimple *new_stmt;
8427
8428   /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
8429   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8430     std::swap (vop[0], vop[1]);
8431
8432   /* Convert every unsigned input, VOP[2] included, to the corresponding
          signed type.  */
8433   for (int i = 0; i < 3; ++i)
8434     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8435       {
8436         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8437         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8438         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8439         vop[i] = tmp;
8440       }
8441
8442   /* In the comments below we assume 8-bit inputs for simplicity,
8443      but the approach works for any full integer type.  */
8444
8445   /* Create a vector of -128.  */
8446   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8447   tree min_narrow = build_vector_from_val (narrow_vectype,
8448                                            min_narrow_elttype);
8449
8450   /* Create a vector of 64, i.e. the bit pattern of -128 logically
          shifted right by one.  */
8451   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8452   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8453   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8454
8455   /* Emit: SUB_RES = VOP[0] - 128, written as an addition of the
          vector of -128.  */
8456   tree sub_res = make_ssa_name (narrow_vectype);
8457   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8458   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8459
8460   /* Emit:
8461
8462        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8463        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8464        STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8465
8466      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y, where
8467      x is the unsigned operand VOP[0] and y is the signed operand VOP[1].
          Doing the two 64 * y steps first allows more time to compute x.  */
8468   tree stage1 = make_ssa_name (wide_vectype);
8469   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8470                                   vop[1], half_narrow, vop[2]);
8471   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8472
8473   tree stage2 = make_ssa_name (wide_vectype);
8474   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8475                                   vop[1], half_narrow, stage1);
8476   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8477
8478   tree stage3 = make_ssa_name (wide_vectype);
8479   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8480                                   sub_res, vop[1], stage2);
8481   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8482
8483   /* Convert STAGE3 to the reduction type.  The conversion is returned
          to the caller rather than emitted here.  */
8484   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8485 }
8486
8487 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8488    value.

        The vector statements are generated at GSI.  Fold-left reductions
        are handed over to vectorize_fold_left_reduction and its result is
        returned.  Otherwise one result statement is built per vector def of
        the first operand; with SLP_NODE each one is pushed as a vector def
        of the node, without it the statements are recorded in
        STMT_VINFO_VEC_STMTS (STMT_INFO) and *VEC_STMT is set to the first
        recorded one.  For a single def-use cycle the result of every copy
        but the last is not recorded but fed to the next copy as its
        reduction operand.  Returns true in that case.  */
8489
8490 bool
8491 vect_transform_reduction (loop_vec_info loop_vinfo,
8492                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8493                           gimple **vec_stmt, slp_tree slp_node)
8494 {
8495   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8496   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8497   int i;
8498   int ncopies;
8499   int vec_num;
8500
8501   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8502   gcc_assert (reduc_info->is_reduc_info);
8503
8504   if (nested_in_vect_loop_p (loop, stmt_info))
8505     {
8506       loop = loop->inner;
8507       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8508     }
8509
8510   gimple_match_op op;
8511   if (!gimple_extract_op (stmt_info->stmt, &op))
8512     gcc_unreachable ();
8513
8514   /* All uses but the last are expected to be defined in the loop.
8515      The last use is the reduction variable.  In case of nested cycle this
8516      assumption is not true: we use reduc_index to record the index of the
8517      reduction variable.  */
8518   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8519   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8520   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8521   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8522
       /* With SLP the number of vector statements comes from the node;
          otherwise it is the number of copies needed for VECTYPE_IN.  */
8523   if (slp_node)
8524     {
8525       ncopies = 1;
8526       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8527     }
8528   else
8529     {
8530       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8531       vec_num = 1;
8532     }
8533
8534   code_helper code = canonicalize_code (op.code, op.type);
8535   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8536
8537   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8538   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8539   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8540
8541   /* Transform.  */
8542   tree new_temp = NULL_TREE;
8543   auto_vec<tree> vec_oprnds0;
8544   auto_vec<tree> vec_oprnds1;
8545   auto_vec<tree> vec_oprnds2;
8546   tree def0;
8547
8548   if (dump_enabled_p ())
8549     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8550
8551   /* FORNOW: Multiple types are not supported for condition.  */
8552   if (code == COND_EXPR)
8553     gcc_assert (ncopies == 1);
8554
8555   /* A binary COND_OP reduction must use the reduction definition as its
8556      else value.  */
8557   bool cond_fn_p = code.is_internal_fn ()
8558     && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8559   if (cond_fn_p)
8560     {
8561       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8562                   || code == IFN_COND_MUL || code == IFN_COND_AND
8563                   || code == IFN_COND_IOR || code == IFN_COND_XOR);
8564       gcc_assert (op.num_ops == 4
8565                   && (op.ops[reduc_index]
8566                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
8567     }
8568
8569   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8570
8571   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8572   if (reduction_type == FOLD_LEFT_REDUCTION)
8573     {
8574       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8575       gcc_assert (code.is_tree_code () || cond_fn_p);
8576       return vectorize_fold_left_reduction
8577           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8578            code, reduc_fn, op.ops, op.num_ops, vectype_in,
8579            reduc_index, masks, lens);
8580     }
8581
8582   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8583   gcc_assert (single_defuse_cycle
8584               || code == DOT_PROD_EXPR
8585               || code == WIDEN_SUM_EXPR
8586               || code == SAD_EXPR);
8587
8588   /* Create the destination vector  */
8589   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8590   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8591
8592   /* Get NCOPIES vector definitions for all operands except the reduction
8593      definition.  */
8594   if (!cond_fn_p)
8595     {
8596       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8597                          single_defuse_cycle && reduc_index == 0
8598                          ? NULL_TREE : op.ops[0], &vec_oprnds0,
8599                          single_defuse_cycle && reduc_index == 1
8600                          ? NULL_TREE : op.ops[1], &vec_oprnds1,
8601                          op.num_ops == 3
8602                          && !(single_defuse_cycle && reduc_index == 2)
8603                          ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8604     }
8605   else
8606     {
8607       /* For a conditional operation pass the truth type as mask
8608          vectype.  */
8609       gcc_assert (single_defuse_cycle
8610                   && (reduc_index == 1 || reduc_index == 2));
8611       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8612                          op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8613                          reduc_index == 1 ? NULL_TREE : op.ops[1],
8614                          NULL_TREE, &vec_oprnds1,
8615                          reduc_index == 2 ? NULL_TREE : op.ops[2],
8616                          NULL_TREE, &vec_oprnds2);
8617     }
8618
8619   /* For single def-use cycles get one copy of the vectorized reduction
8620      definition.  */
8621   if (single_defuse_cycle)
8622     {
8623       gcc_assert (!slp_node);
8624       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8625                                      op.ops[reduc_index],
8626                                      reduc_index == 0 ? &vec_oprnds0
8627                                      : (reduc_index == 1 ? &vec_oprnds1
8628                                         : &vec_oprnds2));
8629     }
8630
8631   bool emulated_mixed_dot_prod
8632     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
       /* Generate one result statement per vector def of operand 0.  */
8633   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8634     {
8635       gimple *new_stmt;
8636       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8637       if (masked_loop_p && !mask_by_cond_expr)
8638         {
8639           /* No conditional ifns have been defined for dot-product yet.  */
8640           gcc_assert (code != DOT_PROD_EXPR);
8641
8642           /* Make sure that the reduction accumulator is vop[0].  */
8643           if (reduc_index == 1)
8644             {
8645               gcc_assert (commutative_binary_op_p (code, op.type));
8646               std::swap (vop[0], vop[1]);
8647             }
8648           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8649                                           vec_num * ncopies, vectype_in, i);
8650           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8651                                                     vop[0], vop[1], vop[0]);
8652           new_temp = make_ssa_name (vec_dest, call);
8653           gimple_call_set_lhs (call, new_temp);
8654           gimple_call_set_nothrow (call, true);
8655           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8656           new_stmt = call;
8657         }
8658       else
8659         {
8660           if (op.num_ops >= 3)
8661             vop[2] = vec_oprnds2[i];
8662
8663           if (masked_loop_p && mask_by_cond_expr)
8664             {
8665               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8666                                               vec_num * ncopies, vectype_in, i);
8667               build_vect_cond_expr (code, vop, mask, gsi);
8668             }
8669
8670           if (emulated_mixed_dot_prod)
8671             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8672                                                     vec_dest, vop);
8673
8674           else if (code.is_internal_fn () && !cond_fn_p)
8675             new_stmt = gimple_build_call_internal (internal_fn (code),
8676                                                    op.num_ops,
8677                                                    vop[0], vop[1], vop[2]);
8678           else if (code.is_internal_fn () && cond_fn_p)
                 /* The else value has to be the reduction operand, which is
                    VOP[REDUC_INDEX] (1 or 2, see the assert above), so that
                    inactive lanes keep the accumulator.  */
8679             new_stmt = gimple_build_call_internal (internal_fn (code),
8680                                                    op.num_ops,
8681                                                    vop[0], vop[1], vop[2],
8682                                                    vop[reduc_index]);
8683           else
8684             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8685                                             vop[0], vop[1], vop[2]);
8686           new_temp = make_ssa_name (vec_dest, new_stmt);
8687           gimple_set_lhs (new_stmt, new_temp);
8688           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8689         }
8690
8691       if (slp_node)
8692         slp_node->push_vec_def (new_stmt);
8693       else if (single_defuse_cycle
8694                && i < ncopies - 1)
8695         {
               /* Feed the result of this copy to the next copy as its
                  reduction operand.  */
8696           if (reduc_index == 0)
8697             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8698           else if (reduc_index == 1)
8699             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8700           else if (reduc_index == 2)
8701             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8702         }
8703       else
8704         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8705     }
8706
8707   if (!slp_node)
8708     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8709
8710   return true;
8711 }
8712
/* Transform phase of a cycle PHI.

   Create the vector PHI node(s) for the scalar cycle PHI STMT_INFO in the
   header of the loop carrying the cycle and set their loop-entry
   (preheader edge) arguments.  The latch arguments are not set here.

   For EXTRACT_LAST_REDUCTION and FOLD_LEFT_REDUCTION nothing is generated
   and the scalar PHI is left in place.

   When SLP_NODE is non-null the new PHIs are pushed as vector defs of
   SLP_NODE; otherwise they are appended to STMT_VINFO_VEC_STMTS of
   STMT_INFO and the first one is stored to *VEC_STMT (which is
   dereferenced unconditionally in that case).  SLP_NODE_INSTANCE is only
   used to assert that SLP_NODE is the instance's reduc_phis node for
   non-nested SLP cycles.

   Every return statement in this function returns true.  */

bool
vect_transform_cycle_phi (loop_vec_info loop_vinfo,
                          stmt_vec_info stmt_info, gimple **vec_stmt,
                          slp_tree slp_node, slp_instance slp_node_instance)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int ncopies;
  int j;
  bool nested_cycle = false;
  int vec_num;

  /* When nested_in_vect_loop_p reports the PHI as nested, all of the
     following operates on the inner loop (its header and preheader
     edge).  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      nested_cycle = true;
    }

  stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
      || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
    /* Leave the scalar phi in place.  */
    return true;

  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
  /* For a nested cycle we do not fill the above.  */
  if (!vectype_in)
    vectype_in = STMT_VINFO_VECTYPE (stmt_info);
  gcc_assert (vectype_in);

  /* VEC_NUM and NCOPIES are the bounds of the outer and inner PHI
     creation loops at the end of this function.  Exactly one of them is
     set to 1 here: NCOPIES for SLP, VEC_NUM otherwise.  */
  if (slp_node)
    {
      /* The size vect_schedule_slp_instance computes is off for us.  */
      vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                                      * SLP_TREE_LANES (slp_node), vectype_in);
      ncopies = 1;
    }
  else
    {
      vec_num = 1;
      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
    }

  /* Check whether we should use a single PHI node and accumulate
     vectors to one before the backedge.  */
  if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
    ncopies = 1;

  /* Create the destination vector  */
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
                                               vectype_out);

  /* Get the loop-entry arguments.  The code below either fills
     VEC_INITIAL_DEFS directly or computes a single VEC_INITIAL_DEF that
     is replicated NCOPIES times further down.  */
  tree vec_initial_def = NULL_TREE;
  auto_vec<tree> vec_initial_defs;
  if (slp_node)
    {
      vec_initial_defs.reserve (vec_num);
      if (nested_cycle)
        {
          /* Take the vector defs of the SLP child that corresponds to
             the preheader edge of the PHI.  */
          unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
          vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
                             &vec_initial_defs);
        }
      else
        {
          gcc_assert (slp_node == slp_node_instance->reduc_phis);
          vec<tree> &initial_values = reduc_info->reduc_initial_values;
          vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);

          /* Record the scalar initial value of each PHI of the node, or
             only of the first one when REDUC_GROUP_FIRST_ELEMENT is set
             for the reduction statement.  */
          unsigned int num_phis = stmts.length ();
          if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
            num_phis = 1;
          initial_values.reserve (num_phis);
          for (unsigned int i = 0; i < num_phis; ++i)
            {
              gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
              initial_values.quick_push (vect_phi_initial_value (this_phi));
            }
          /* NOTE(review): the emptiness check that follows suggests
             vect_find_reusable_accumulator may clear the recorded
             initial values when it finds an accumulator -- confirm.  */
          if (vec_num == 1)
            vect_find_reusable_accumulator (loop_vinfo, reduc_info);
          if (!initial_values.is_empty ())
            {
              tree initial_value
                = (num_phis == 1 ? initial_values[0] : NULL_TREE);
              code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
              tree neutral_op
                = neutral_op_for_reduction (TREE_TYPE (vectype_out),
                                            code, initial_value);
              get_initial_defs_for_reduction (loop_vinfo, reduc_info,
                                              &vec_initial_defs, vec_num,
                                              stmts.length (), neutral_op);
            }
        }
    }
  else
    {
      /* Get at the scalar def before the loop, that defines the initial
         value of the reduction variable.  */
      tree initial_def = vect_phi_initial_value (phi);
      reduc_info->reduc_initial_values.safe_push (initial_def);
      /* Optimize: if initial_def is for REDUC_MAX smaller than the base
         and we can't use zero for induc_val, use initial_def.  Similarly
         for REDUC_MIN and initial_def larger than the base.  */
      if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
        {
          tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
          if (TREE_CODE (initial_def) == INTEGER_CST
              && !integer_zerop (induc_val)
              && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
                   && tree_int_cst_lt (initial_def, induc_val))
                  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
                      && tree_int_cst_lt (induc_val, initial_def))))
            {
              induc_val = initial_def;
              /* Communicate we used the initial_def to epilogue
                 generation.  */
              STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
            }
          vec_initial_def = build_vector_from_val (vectype_out, induc_val);
        }
      else if (nested_cycle)
        {
          /* Do not use an adjustment def as that case is not supported
             correctly if ncopies is not one.  */
          vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
                                         ncopies, initial_def,
                                         &vec_initial_defs);
        }
      else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
               || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
        /* Fill the initial vector with the initial scalar value.  */
        vec_initial_def
          = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
                                           initial_def, initial_def);
      else
        {
          /* Accumulator reuse is only attempted with a single copy.  */
          if (ncopies == 1)
            vect_find_reusable_accumulator (loop_vinfo, reduc_info);
          if (!reduc_info->reduc_initial_values.is_empty ())
            {
              initial_def = reduc_info->reduc_initial_values[0];
              code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
              tree neutral_op
                = neutral_op_for_reduction (TREE_TYPE (initial_def),
                                            code, initial_def);
              gcc_assert (neutral_op);
              /* Try to simplify the vector initialization by applying an
                 adjustment after the reduction has been performed.  */
              if (!reduc_info->reused_accumulator
                  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
                  && !operand_equal_p (neutral_op, initial_def))
                {
                  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
                    = initial_def;
                  initial_def = neutral_op;
                }
              vec_initial_def
                = get_initial_def_for_reduction (loop_vinfo, reduc_info,
                                                 initial_def, neutral_op);
            }
        }
    }

  /* A single initial vector was computed above; use it for every
     copy.  */
  if (vec_initial_def)
    {
      vec_initial_defs.create (ncopies);
      for (i = 0; i < ncopies; ++i)
        vec_initial_defs.quick_push (vec_initial_def);
    }

  /* When an accumulator is reused, its reduc_input (converted to
     VECTYPE_OUT if necessary) supplies the loop-entry value.  */
  if (auto *accumulator = reduc_info->reused_accumulator)
    {
      tree def = accumulator->reduc_input;
      if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
        {
          /* NREDUC is the factor by which the accumulator has more
             elements than VECTYPE_OUT; it must be a constant.  */
          unsigned int nreduc;
          bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
                                            (TREE_TYPE (def)),
                                          TYPE_VECTOR_SUBPARTS (vectype_out),
                                          &nreduc);
          gcc_assert (res);
          gimple_seq stmts = NULL;
          /* Reduce the single vector to a smaller one.  */
          if (nreduc != 1)
            {
              /* Perform the reduction in the appropriate type.  */
              tree rvectype = vectype_out;
              if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
                                              TREE_TYPE (TREE_TYPE (def))))
                rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
                                              TYPE_VECTOR_SUBPARTS
                                                (vectype_out));
              def = vect_create_partial_epilog (def, rvectype,
                                                STMT_VINFO_REDUC_CODE
                                                  (reduc_info),
                                                &stmts);
            }
          /* The epilogue loop might use a different vector mode, like
             VNx2DI vs. V2DI.  */
          if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
            {
              tree reduc_type = build_vector_type_for_mode
                (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
              def = gimple_convert (&stmts, reduc_type, def);
            }
          /* Adjust the input so we pick up the partially reduced value
             for the skip edge in vect_create_epilog_for_reduction.  */
          accumulator->reduc_input = def;
          /* And the reduction could be carried out using a different sign.  */
          if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
            def = gimple_convert (&stmts, vectype_out, def);
          if (loop_vinfo->main_loop_edge)
            {
              /* While we'd like to insert on the edge this will split
                 blocks and disturb bookkeeping, we also will eventually
                 need this on the skip edge.  Rely on sinking to
                 fixup optimal placement and insert in the pred.  */
              gimple_stmt_iterator gsi
                = gsi_last_bb (loop_vinfo->main_loop_edge->src);
              /* Insert before a cond that eventually skips the
                 epilogue.  */
              if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
                gsi_prev (&gsi);
              gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
            }
          else
            gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
                                              stmts);
        }
      /* With a main loop edge, the first initial def is replaced by the
         result of vect_get_main_loop_result on DEF and the previously
         computed initial def; otherwise DEF itself is pushed as the
         initial def.  */
      if (loop_vinfo->main_loop_edge)
        vec_initial_defs[0]
          = vect_get_main_loop_result (loop_vinfo, def,
                                       vec_initial_defs[0]);
      else
        vec_initial_defs.safe_push (def);
    }

  /* Generate the reduction PHIs upfront.  */
  for (i = 0; i < vec_num; i++)
    {
      tree vec_init_def = vec_initial_defs[i];
      for (j = 0; j < ncopies; j++)
        {
          /* Create the reduction-phi that defines the reduction
             operand.  */
          gphi *new_phi = create_phi_node (vec_dest, loop->header);

          /* Set the loop-entry arg of the reduction-phi.  For a nested
             cycle every copy after the first uses its own initial def,
             indexed by the copy number.  */
          if (j != 0 && nested_cycle)
            vec_init_def = vec_initial_defs[j];
          add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
                       UNKNOWN_LOCATION);

          /* The loop-latch arg is set in epilogue processing.  */

          if (slp_node)
            slp_node->push_vec_def (new_phi);
          else
            {
              if (j == 0)
                *vec_stmt = new_phi;
              STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
            }
        }
    }

  return true;
}
8990
8991 /* Vectorizes LC PHIs.  */
8992
8993 bool
8994 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8995                      stmt_vec_info stmt_info, gimple **vec_stmt,
8996                      slp_tree slp_node)
8997 {
8998   if (!loop_vinfo
8999       || !is_a <gphi *> (stmt_info->stmt)
9000       || gimple_phi_num_args (stmt_info->stmt) != 1)
9001     return false;
9002
9003   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
9004       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
9005     return false;
9006
9007   if (!vec_stmt) /* transformation not required.  */
9008     {
9009       /* Deal with copies from externs or constants that disguise as
9010          loop-closed PHI nodes (PR97886).  */
9011       if (slp_node
9012           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
9013                                                 SLP_TREE_VECTYPE (slp_node)))
9014         {
9015           if (dump_enabled_p ())
9016             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9017                              "incompatible vector types for invariants\n");
9018           return false;
9019         }
9020       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9021       return true;
9022     }
9023
9024   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9025   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9026   basic_block bb = gimple_bb (stmt_info->stmt);
9027   edge e = single_pred_edge (bb);
9028   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9029   auto_vec<tree> vec_oprnds;
9030   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9031                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9032                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9033   for (unsigned i = 0; i < vec_oprnds.length (); i++)
9034     {
9035       /* Create the vectorized LC PHI node.  */
9036       gphi *new_phi = create_phi_node (vec_dest, bb);
9037       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9038       if (slp_node)
9039         slp_node->push_vec_def (new_phi);
9040       else
9041         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9042     }
9043   if (!slp_node)
9044     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9045
9046   return true;
9047 }
9048
9049 /* Vectorizes PHIs.  */
9050
9051 bool
9052 vectorizable_phi (vec_info *,
9053                   stmt_vec_info stmt_info, gimple **vec_stmt,
9054                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9055 {
9056   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9057     return false;
9058
9059   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9060     return false;
9061
9062   tree vectype = SLP_TREE_VECTYPE (slp_node);
9063
9064   if (!vec_stmt) /* transformation not required.  */
9065     {
9066       slp_tree child;
9067       unsigned i;
9068       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9069         if (!child)
9070           {
9071             if (dump_enabled_p ())
9072               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9073                                "PHI node with unvectorized backedge def\n");
9074             return false;
9075           }
9076         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9077           {
9078             if (dump_enabled_p ())
9079               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9080                                "incompatible vector types for invariants\n");
9081             return false;
9082           }
9083         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9084                  && !useless_type_conversion_p (vectype,
9085                                                 SLP_TREE_VECTYPE (child)))
9086           {
9087             /* With bools we can have mask and non-mask precision vectors
9088                or different non-mask precisions.  while pattern recog is
9089                supposed to guarantee consistency here bugs in it can cause
9090                mismatches (PR103489 and PR103800 for example).
9091                Deal with them here instead of ICEing later.  */
9092             if (dump_enabled_p ())
9093               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9094                                "incompatible vector type setup from "
9095                                "bool pattern detection\n");
9096             return false;
9097           }
9098
9099       /* For single-argument PHIs assume coalescing which means zero cost
9100          for the scalar and the vector PHIs.  This avoids artificially
9101          favoring the vector path (but may pessimize it in some cases).  */
9102       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9103         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9104                           vector_stmt, stmt_info, vectype, 0, vect_body);
9105       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9106       return true;
9107     }
9108
9109   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9110   basic_block bb = gimple_bb (stmt_info->stmt);
9111   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9112   auto_vec<gphi *> new_phis;
9113   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9114     {
9115       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9116
9117       /* Skip not yet vectorized defs.  */
9118       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9119           && SLP_TREE_VEC_DEFS (child).is_empty ())
9120         continue;
9121
9122       auto_vec<tree> vec_oprnds;
9123       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9124       if (!new_phis.exists ())
9125         {
9126           new_phis.create (vec_oprnds.length ());
9127           for (unsigned j = 0; j < vec_oprnds.length (); j++)
9128             {
9129               /* Create the vectorized LC PHI node.  */
9130               new_phis.quick_push (create_phi_node (vec_dest, bb));
9131               slp_node->push_vec_def (new_phis[j]);
9132             }
9133         }
9134       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9135       for (unsigned j = 0; j < vec_oprnds.length (); j++)
9136         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9137     }
9138   /* We should have at least one already vectorized child.  */
9139   gcc_assert (new_phis.exists ());
9140
9141   return true;
9142 }
9143
9144 /* Vectorizes first order recurrences.  An overview of the transformation
9145    is described below. Suppose we have the following loop.
9146
9147      int t = 0;
9148      for (int i = 0; i < n; ++i)
9149        {
9150          b[i] = a[i] - t;
9151          t = a[i];
9152        }
9153
9154    There is a first-order recurrence on 'a'. For this loop, the scalar IR
9155    looks (simplified) like:
9156
9157     scalar.preheader:
9158       init = 0;
9159
9160     scalar.body:
9161       i = PHI <0(scalar.preheader), i+1(scalar.body)>
9162       _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9163       _1 = a[i]
9164       b[i] = _1 - _2
9165       if (i < n) goto scalar.body
9166
9167    In this example, _2 is a recurrence because it's value depends on the
9168    previous iteration.  We vectorize this as (VF = 4)
9169
9170     vector.preheader:
9171       vect_init = vect_cst(..., ..., ..., 0)
9172
9173     vector.body
9174       i = PHI <0(vector.preheader), i+4(vector.body)>
9175       vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9176       vect_2 = a[i, i+1, i+2, i+3];
9177       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9178       b[i, i+1, i+2, i+3] = vect_2 - vect_3
9179       if (..) goto vector.body
9180
9181    In this function, vectorizable_recurr, we code generate both the
9182    vector PHI node and the permute since those together compute the
9183    vectorized value of the scalar PHI.  We do not yet have the
9184    backedge value to fill in there nor into the vec_perm.  Those
9185    are filled in maybe_set_vectorized_backedge_value and
9186    vect_schedule_scc.
9187
9188    TODO:  Since the scalar loop does not have a use of the recurrence
9189    outside of the loop the natural way to implement peeling via
9190    vectorizing the live value doesn't work.  For now peeling of loops
9191    with a recurrence is not implemented.  For SLP the supported cases
9192    are restricted to those requiring a single vector recurrence PHI.  */
9193
9194 bool
9195 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9196                      gimple **vec_stmt, slp_tree slp_node,
9197                      stmt_vector_for_cost *cost_vec)
9198 {
9199   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9200     return false;
9201
9202   gphi *phi = as_a<gphi *> (stmt_info->stmt);
9203
9204   /* So far we only support first-order recurrence auto-vectorization.  */
9205   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9206     return false;
9207
9208   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9209   unsigned ncopies;
9210   if (slp_node)
9211     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9212   else
9213     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9214   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9215   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9216   /* We need to be able to make progress with a single vector.  */
9217   if (maybe_gt (dist * 2, nunits))
9218     {
9219       if (dump_enabled_p ())
9220         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9221                          "first order recurrence exceeds half of "
9222                          "a vector\n");
9223       return false;
9224     }
9225
9226   /* First-order recurrence autovectorization needs to handle permutation
9227      with indices = [nunits-1, nunits, nunits+1, ...].  */
9228   vec_perm_builder sel (nunits, 1, 3);
9229   for (int i = 0; i < 3; ++i)
9230     sel.quick_push (nunits - dist + i);
9231   vec_perm_indices indices (sel, 2, nunits);
9232
9233   if (!vec_stmt) /* transformation not required.  */
9234     {
9235       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9236                                  indices))
9237         return false;
9238
9239       if (slp_node)
9240         {
9241           /* We eventually need to set a vector type on invariant
9242              arguments.  */
9243           unsigned j;
9244           slp_tree child;
9245           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9246             if (!vect_maybe_update_slp_op_vectype
9247                   (child, SLP_TREE_VECTYPE (slp_node)))
9248               {
9249                 if (dump_enabled_p ())
9250                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9251                                    "incompatible vector types for "
9252                                    "invariants\n");
9253                 return false;
9254               }
9255         }
9256       /* The recurrence costs the initialization vector and one permute
9257          for each copy.  */
9258       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9259                                                  stmt_info, 0, vect_prologue);
9260       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9261                                                stmt_info, 0, vect_body);
9262       if (dump_enabled_p ())
9263         dump_printf_loc (MSG_NOTE, vect_location,
9264                          "vectorizable_recurr: inside_cost = %d, "
9265                          "prologue_cost = %d .\n", inside_cost,
9266                          prologue_cost);
9267
9268       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9269       return true;
9270     }
9271
9272   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9273   basic_block bb = gimple_bb (phi);
9274   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9275   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9276     {
9277       gimple_seq stmts = NULL;
9278       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9279       gsi_insert_seq_on_edge_immediate (pe, stmts);
9280     }
9281   tree vec_init = build_vector_from_val (vectype, preheader);
9282   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9283
9284   /* Create the vectorized first-order PHI node.  */
9285   tree vec_dest = vect_get_new_vect_var (vectype,
9286                                          vect_simple_var, "vec_recur_");
9287   gphi *new_phi = create_phi_node (vec_dest, bb);
9288   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9289
9290   /* Insert shuffles the first-order recurrence autovectorization.
9291        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
9292   tree perm = vect_gen_perm_mask_checked (vectype, indices);
9293
9294   /* Insert the required permute after the latch definition.  The
9295      second and later operands are tentative and will be updated when we have
9296      vectorized the latch definition.  */
9297   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9298   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9299   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9300   gsi_next (&gsi2);
9301
9302   for (unsigned i = 0; i < ncopies; ++i)
9303     {
9304       vec_dest = make_ssa_name (vectype);
9305       gassign *vperm
9306           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9307                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
9308                                  NULL, perm);
9309       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9310
9311       if (slp_node)
9312         slp_node->push_vec_def (vperm);
9313       else
9314         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9315     }
9316
9317   if (!slp_node)
9318     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9319   return true;
9320 }
9321
9322 /* Return true if VECTYPE represents a vector that requires lowering
9323    by the vector lowering pass.  */
9324
9325 bool
9326 vect_emulated_vector_p (tree vectype)
9327 {
9328   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9329           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9330               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9331 }
9332
9333 /* Return true if we can emulate CODE on an integer mode representation
9334    of a vector.  */
9335
9336 bool
9337 vect_can_vectorize_without_simd_p (tree_code code)
9338 {
9339   switch (code)
9340     {
9341     case PLUS_EXPR:
9342     case MINUS_EXPR:
9343     case NEGATE_EXPR:
9344     case BIT_AND_EXPR:
9345     case BIT_IOR_EXPR:
9346     case BIT_XOR_EXPR:
9347     case BIT_NOT_EXPR:
9348       return true;
9349
9350     default:
9351       return false;
9352     }
9353 }
9354
9355 /* Likewise, but taking a code_helper.  */
9356
9357 bool
9358 vect_can_vectorize_without_simd_p (code_helper code)
9359 {
9360   return (code.is_tree_code ()
9361           && vect_can_vectorize_without_simd_p (tree_code (code)));
9362 }
9363
9364 /* Create vector init for vectorized iv.  */
9365 static tree
9366 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9367                                tree step_expr, poly_uint64 nunits,
9368                                tree vectype,
9369                                enum vect_induction_op_type induction_type)
9370 {
9371   unsigned HOST_WIDE_INT const_nunits;
9372   tree vec_shift, vec_init, new_name;
9373   unsigned i;
9374   tree itype = TREE_TYPE (vectype);
9375
9376   /* iv_loop is the loop to be vectorized. Create:
9377      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
9378   new_name = gimple_convert (stmts, itype, init_expr);
9379   switch (induction_type)
9380     {
9381     case vect_step_op_shr:
9382     case vect_step_op_shl:
9383       /* Build the Initial value from shift_expr.  */
9384       vec_init = gimple_build_vector_from_val (stmts,
9385                                                vectype,
9386                                                new_name);
9387       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9388                                 build_zero_cst (itype), step_expr);
9389       vec_init = gimple_build (stmts,
9390                                (induction_type == vect_step_op_shr
9391                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9392                                vectype, vec_init, vec_shift);
9393       break;
9394
9395     case vect_step_op_neg:
9396       {
9397         vec_init = gimple_build_vector_from_val (stmts,
9398                                                  vectype,
9399                                                  new_name);
9400         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9401                                      vectype, vec_init);
9402         /* The encoding has 2 interleaved stepped patterns.  */
9403         vec_perm_builder sel (nunits, 2, 3);
9404         sel.quick_grow (6);
9405         for (i = 0; i < 3; i++)
9406           {
9407             sel[2 * i] = i;
9408             sel[2 * i + 1] = i + nunits;
9409           }
9410         vec_perm_indices indices (sel, 2, nunits);
9411         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9412            fail when vec_init is const vector. In that situation vec_perm is not
9413            really needed.  */
9414         tree perm_mask_even
9415           = vect_gen_perm_mask_any (vectype, indices);
9416         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9417                                  vectype,
9418                                  vec_init, vec_neg,
9419                                  perm_mask_even);
9420       }
9421       break;
9422
9423     case vect_step_op_mul:
9424       {
9425         /* Use unsigned mult to avoid UD integer overflow.  */
9426         gcc_assert (nunits.is_constant (&const_nunits));
9427         tree utype = unsigned_type_for (itype);
9428         tree uvectype = build_vector_type (utype,
9429                                            TYPE_VECTOR_SUBPARTS (vectype));
9430         new_name = gimple_convert (stmts, utype, new_name);
9431         vec_init = gimple_build_vector_from_val (stmts,
9432                                                  uvectype,
9433                                                  new_name);
9434         tree_vector_builder elts (uvectype, const_nunits, 1);
9435         tree elt_step = build_one_cst (utype);
9436
9437         elts.quick_push (elt_step);
9438         for (i = 1; i < const_nunits; i++)
9439           {
9440             /* Create: new_name_i = new_name + step_expr.  */
9441             elt_step = gimple_build (stmts, MULT_EXPR,
9442                                      utype, elt_step, step_expr);
9443             elts.quick_push (elt_step);
9444           }
9445         /* Create a vector from [new_name_0, new_name_1, ...,
9446            new_name_nunits-1].  */
9447         tree vec_mul = gimple_build_vector (stmts, &elts);
9448         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9449                                  vec_init, vec_mul);
9450         vec_init = gimple_convert (stmts, vectype, vec_init);
9451       }
9452       break;
9453
9454     default:
9455       gcc_unreachable ();
9456     }
9457
9458   return vec_init;
9459 }
9460
9461 /* Peel init_expr by skip_niter for induction_type.  */
9462 tree
9463 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9464                              tree skip_niters, tree step_expr,
9465                              enum vect_induction_op_type induction_type)
9466 {
9467   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9468   tree type = TREE_TYPE (init_expr);
9469   unsigned prec = TYPE_PRECISION (type);
9470   switch (induction_type)
9471     {
9472     case vect_step_op_neg:
9473       if (TREE_INT_CST_LOW (skip_niters) % 2)
9474         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9475       /* else no change.  */
9476       break;
9477
9478     case vect_step_op_shr:
9479     case vect_step_op_shl:
9480       skip_niters = gimple_convert (stmts, type, skip_niters);
9481       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9482       /* When shift mount >= precision, need to avoid UD.
9483          In the original loop, there's no UD, and according to semantic,
9484          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9485       if (!tree_fits_uhwi_p (step_expr)
9486           || tree_to_uhwi (step_expr) >= prec)
9487         {
9488           if (induction_type == vect_step_op_shl
9489               || TYPE_UNSIGNED (type))
9490             init_expr = build_zero_cst (type);
9491           else
9492             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9493                                       init_expr,
9494                                       wide_int_to_tree (type, prec - 1));
9495         }
9496       else
9497         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9498                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9499                                   type, init_expr, step_expr);
9500       break;
9501
9502     case vect_step_op_mul:
9503       {
9504         tree utype = unsigned_type_for (type);
9505         init_expr = gimple_convert (stmts, utype, init_expr);
9506         wide_int skipn = wi::to_wide (skip_niters);
9507         wide_int begin = wi::to_wide (step_expr);
9508         auto_mpz base, exp, mod, res;
9509         wi::to_mpz (begin, base, TYPE_SIGN (type));
9510         wi::to_mpz (skipn, exp, UNSIGNED);
9511         mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9512         mpz_powm (res, base, exp, mod);
9513         begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9514         tree mult_expr = wide_int_to_tree (utype, begin);
9515         init_expr = gimple_build (stmts, MULT_EXPR, utype,
9516                                   init_expr, mult_expr);
9517         init_expr = gimple_convert (stmts, type, init_expr);
9518       }
9519       break;
9520
9521     default:
9522       gcc_unreachable ();
9523     }
9524
9525   return init_expr;
9526 }
9527
9528 /* Create vector step for vectorized iv.  */
9529 static tree
9530 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9531                                poly_uint64 vf,
9532                                enum vect_induction_op_type induction_type)
9533 {
9534   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9535   tree new_name = NULL;
9536   /* Step should be pow (step, vf) for mult induction.  */
9537   if (induction_type == vect_step_op_mul)
9538     {
9539       gcc_assert (vf.is_constant ());
9540       wide_int begin = wi::to_wide (step_expr);
9541
9542       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9543         begin = wi::mul (begin, wi::to_wide (step_expr));
9544
9545       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9546     }
9547   else if (induction_type == vect_step_op_neg)
9548     /* Do nothing.  */
9549     ;
9550   else
9551     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9552                              expr, step_expr);
9553   return new_name;
9554 }
9555
9556 static tree
9557 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9558                                    stmt_vec_info stmt_info,
9559                                    tree new_name, tree vectype,
9560                                    enum vect_induction_op_type induction_type)
9561 {
9562   /* No step is needed for neg induction.  */
9563   if (induction_type == vect_step_op_neg)
9564     return NULL;
9565
9566   tree t = unshare_expr (new_name);
9567   gcc_assert (CONSTANT_CLASS_P (new_name)
9568               || TREE_CODE (new_name) == SSA_NAME);
9569   tree new_vec = build_vector_from_val (vectype, t);
9570   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9571                                     new_vec, vectype, NULL);
9572   return vec_step;
9573 }
9574
9575 /* Update vectorized iv with vect_step, induc_def is init.  */
9576 static tree
9577 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9578                           tree induc_def, tree vec_step,
9579                           enum vect_induction_op_type induction_type)
9580 {
9581   tree vec_def = induc_def;
9582   switch (induction_type)
9583     {
9584     case vect_step_op_mul:
9585       {
9586         /* Use unsigned mult to avoid UD integer overflow.  */
9587         tree uvectype
9588           = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9589                                TYPE_VECTOR_SUBPARTS (vectype));
9590         vec_def = gimple_convert (stmts, uvectype, vec_def);
9591         vec_step = gimple_convert (stmts, uvectype, vec_step);
9592         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9593                                 vec_def, vec_step);
9594         vec_def = gimple_convert (stmts, vectype, vec_def);
9595       }
9596       break;
9597
9598     case vect_step_op_shr:
9599       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9600                               vec_def, vec_step);
9601       break;
9602
9603     case vect_step_op_shl:
9604       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9605                               vec_def, vec_step);
9606       break;
9607     case vect_step_op_neg:
9608       vec_def = induc_def;
9609       /* Do nothing.  */
9610       break;
9611     default:
9612       gcc_unreachable ();
9613     }
9614
9615   return vec_def;
9616
9617 }
9618
9619 /* Function vectorizable_induction
9620
9621    Check if STMT_INFO performs an nonlinear induction computation that can be
9622    vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9623    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9624    basic block.
9625    Return true if STMT_INFO is vectorizable in this way.  */
9626
9627 static bool
9628 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9629                                   stmt_vec_info stmt_info,
9630                                   gimple **vec_stmt, slp_tree slp_node,
9631                                   stmt_vector_for_cost *cost_vec)
9632 {
9633   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9634   unsigned ncopies;
9635   bool nested_in_vect_loop = false;
9636   class loop *iv_loop;
9637   tree vec_def;
9638   edge pe = loop_preheader_edge (loop);
9639   basic_block new_bb;
9640   tree vec_init, vec_step;
9641   tree new_name;
9642   gimple *new_stmt;
9643   gphi *induction_phi;
9644   tree induc_def, vec_dest;
9645   tree init_expr, step_expr;
9646   tree niters_skip;
9647   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9648   unsigned i;
9649   gimple_stmt_iterator si;
9650
9651   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9652
9653   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9654   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9655   enum vect_induction_op_type induction_type
9656     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9657
9658   gcc_assert (induction_type > vect_step_op_add);
9659
9660   if (slp_node)
9661     ncopies = 1;
9662   else
9663     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9664   gcc_assert (ncopies >= 1);
9665
9666   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9667   if (nested_in_vect_loop_p (loop, stmt_info))
9668     {
9669       if (dump_enabled_p ())
9670         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9671                          "nonlinear induction in nested loop.\n");
9672       return false;
9673     }
9674
9675   iv_loop = loop;
9676   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9677
9678   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9679      update for each iv and a permutation to generate wanted vector iv.  */
9680   if (slp_node)
9681     {
9682       if (dump_enabled_p ())
9683         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9684                          "SLP induction not supported for nonlinear"
9685                          " induction.\n");
9686       return false;
9687     }
9688
9689   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9690     {
9691       if (dump_enabled_p ())
9692         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9693                          "floating point nonlinear induction vectorization"
9694                          " not supported.\n");
9695       return false;
9696     }
9697
9698   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9699   init_expr = vect_phi_initial_value (phi);
9700   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9701               && TREE_CODE (step_expr) == INTEGER_CST);
9702   /* step_expr should be aligned with init_expr,
9703      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
9704   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9705
9706   if (TREE_CODE (init_expr) == INTEGER_CST)
9707     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9708   else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9709     {
9710       /* INIT_EXPR could be a bit_field, bail out for such case.  */
9711       if (dump_enabled_p ())
9712         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9713                          "nonlinear induction vectorization failed:"
9714                          " component type of vectype is not a nop conversion"
9715                          " from type of init_expr.\n");
9716       return false;
9717     }
9718
9719   switch (induction_type)
9720     {
9721     case vect_step_op_neg:
9722       if (TREE_CODE (init_expr) != INTEGER_CST
9723           && TREE_CODE (init_expr) != REAL_CST)
9724         {
9725           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9726           if (!directly_supported_p (NEGATE_EXPR, vectype))
9727             return false;
9728
9729           /* The encoding has 2 interleaved stepped patterns.  */
9730           vec_perm_builder sel (nunits, 2, 3);
9731           machine_mode mode = TYPE_MODE (vectype);
9732           sel.quick_grow (6);
9733           for (i = 0; i < 3; i++)
9734             {
9735               sel[i * 2] = i;
9736               sel[i * 2 + 1] = i + nunits;
9737             }
9738           vec_perm_indices indices (sel, 2, nunits);
9739           if (!can_vec_perm_const_p (mode, mode, indices))
9740             return false;
9741         }
9742       break;
9743
9744     case vect_step_op_mul:
9745       {
9746         /* Check for backend support of MULT_EXPR.  */
9747         if (!directly_supported_p (MULT_EXPR, vectype))
9748           return false;
9749
9750         /* ?? How to construct vector step for variable number vector.
9751            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9752         if (!vf.is_constant ())
9753           return false;
9754       }
9755       break;
9756
9757     case vect_step_op_shr:
9758       /* Check for backend support of RSHIFT_EXPR.  */
9759       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9760         return false;
9761
9762       /* Don't shift more than type precision to avoid UD.  */
9763       if (!tree_fits_uhwi_p (step_expr)
9764           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9765                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9766         return false;
9767       break;
9768
9769     case vect_step_op_shl:
9770       /* Check for backend support of RSHIFT_EXPR.  */
9771       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9772         return false;
9773
9774       /* Don't shift more than type precision to avoid UD.  */
9775       if (!tree_fits_uhwi_p (step_expr)
9776           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9777                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9778         return false;
9779
9780       break;
9781
9782     default:
9783       gcc_unreachable ();
9784     }
9785
9786   if (!vec_stmt) /* transformation not required.  */
9787     {
9788       unsigned inside_cost = 0, prologue_cost = 0;
9789       /* loop cost for vec_loop. Neg induction doesn't have any
9790          inside_cost.  */
9791       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9792                                       stmt_info, 0, vect_body);
9793
9794       /* loop cost for vec_loop. Neg induction doesn't have any
9795          inside_cost.  */
9796       if (induction_type == vect_step_op_neg)
9797         inside_cost = 0;
9798
9799       /* prologue cost for vec_init and vec_step.  */
9800       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9801                                         stmt_info, 0, vect_prologue);
9802
9803       if (dump_enabled_p ())
9804         dump_printf_loc (MSG_NOTE, vect_location,
9805                          "vect_model_induction_cost: inside_cost = %d, "
9806                          "prologue_cost = %d. \n", inside_cost,
9807                          prologue_cost);
9808
9809       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9810       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9811       return true;
9812     }
9813
9814   /* Transform.  */
9815
9816   /* Compute a vector variable, initialized with the first VF values of
9817      the induction variable.  E.g., for an iv with IV_PHI='X' and
9818      evolution S, for a vector of 4 units, we want to compute:
9819      [X, X + S, X + 2*S, X + 3*S].  */
9820
9821   if (dump_enabled_p ())
9822     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9823
9824   pe = loop_preheader_edge (iv_loop);
9825   /* Find the first insertion point in the BB.  */
9826   basic_block bb = gimple_bb (phi);
9827   si = gsi_after_labels (bb);
9828
9829   gimple_seq stmts = NULL;
9830
9831   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9832   /* If we are using the loop mask to "peel" for alignment then we need
9833      to adjust the start value here.  */
9834   if (niters_skip != NULL_TREE)
9835     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9836                                              step_expr, induction_type);
9837
9838   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9839                                             step_expr, nunits, vectype,
9840                                             induction_type);
9841   if (stmts)
9842     {
9843       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9844       gcc_assert (!new_bb);
9845     }
9846
9847   stmts = NULL;
9848   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9849                                             vf, induction_type);
9850   if (stmts)
9851     {
9852       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9853       gcc_assert (!new_bb);
9854     }
9855
9856   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9857                                                 new_name, vectype,
9858                                                 induction_type);
9859   /* Create the following def-use cycle:
9860      loop prolog:
9861      vec_init = ...
9862      vec_step = ...
9863      loop:
9864      vec_iv = PHI <vec_init, vec_loop>
9865      ...
9866      STMT
9867      ...
9868      vec_loop = vec_iv + vec_step;  */
9869
9870   /* Create the induction-phi that defines the induction-operand.  */
9871   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9872   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9873   induc_def = PHI_RESULT (induction_phi);
9874
9875   /* Create the iv update inside the loop.  */
9876   stmts = NULL;
9877   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9878                                       induc_def, vec_step,
9879                                       induction_type);
9880
9881   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9882   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9883
9884   /* Set the arguments of the phi node:  */
9885   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9886   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9887                UNKNOWN_LOCATION);
9888
9889   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9890   *vec_stmt = induction_phi;
9891
9892   /* In case that vectorization factor (VF) is bigger than the number
9893      of elements that we can fit in a vectype (nunits), we have to generate
9894      more than one vector stmt - i.e - we need to "unroll" the
9895      vector stmt by a factor VF/nunits.  For more details see documentation
9896      in vectorizable_operation.  */
9897
9898   if (ncopies > 1)
9899     {
9900       stmts = NULL;
9901       /* FORNOW. This restriction should be relaxed.  */
9902       gcc_assert (!nested_in_vect_loop);
9903
9904       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9905                                                 nunits, induction_type);
9906
9907       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9908                                                     new_name, vectype,
9909                                                     induction_type);
9910       vec_def = induc_def;
9911       for (i = 1; i < ncopies; i++)
9912         {
9913           /* vec_i = vec_prev + vec_step.  */
9914           stmts = NULL;
9915           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9916                                               vec_def, vec_step,
9917                                               induction_type);
9918           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9919           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9920           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9921         }
9922     }
9923
9924   if (dump_enabled_p ())
9925     dump_printf_loc (MSG_NOTE, vect_location,
9926                      "transform induction: created def-use cycle: %G%G",
9927                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9928
9929   return true;
9930 }
9931
9932 /* Function vectorizable_induction
9933
9934    Check if STMT_INFO performs an induction computation that can be vectorized.
9935    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9936    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9937    Return true if STMT_INFO is vectorizable in this way.  */
9938
9939 bool
9940 vectorizable_induction (loop_vec_info loop_vinfo,
9941                         stmt_vec_info stmt_info,
9942                         gimple **vec_stmt, slp_tree slp_node,
9943                         stmt_vector_for_cost *cost_vec)
9944 {
9945   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9946   unsigned ncopies;
9947   bool nested_in_vect_loop = false;
9948   class loop *iv_loop;
9949   tree vec_def;
9950   edge pe = loop_preheader_edge (loop);
9951   basic_block new_bb;
9952   tree new_vec, vec_init, vec_step, t;
9953   tree new_name;
9954   gimple *new_stmt;
9955   gphi *induction_phi;
9956   tree induc_def, vec_dest;
9957   tree init_expr, step_expr;
9958   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9959   unsigned i;
9960   tree expr;
9961   gimple_stmt_iterator si;
9962   enum vect_induction_op_type induction_type
9963     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9964
9965   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9966   if (!phi)
9967     return false;
9968
9969   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9970     return false;
9971
9972   /* Make sure it was recognized as induction computation.  */
9973   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9974     return false;
9975
9976   /* Handle nonlinear induction in a separate place.  */
9977   if (induction_type != vect_step_op_add)
9978     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9979                                              vec_stmt, slp_node, cost_vec);
9980
9981   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9982   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9983
9984   if (slp_node)
9985     ncopies = 1;
9986   else
9987     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9988   gcc_assert (ncopies >= 1);
9989
9990   /* FORNOW. These restrictions should be relaxed.  */
9991   if (nested_in_vect_loop_p (loop, stmt_info))
9992     {
9993       imm_use_iterator imm_iter;
9994       use_operand_p use_p;
9995       gimple *exit_phi;
9996       edge latch_e;
9997       tree loop_arg;
9998
9999       if (ncopies > 1)
10000         {
10001           if (dump_enabled_p ())
10002             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10003                              "multiple types in nested loop.\n");
10004           return false;
10005         }
10006
10007       exit_phi = NULL;
10008       latch_e = loop_latch_edge (loop->inner);
10009       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
10010       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
10011         {
10012           gimple *use_stmt = USE_STMT (use_p);
10013           if (is_gimple_debug (use_stmt))
10014             continue;
10015
10016           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
10017             {
10018               exit_phi = use_stmt;
10019               break;
10020             }
10021         }
10022       if (exit_phi)
10023         {
10024           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10025           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10026                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10027             {
10028               if (dump_enabled_p ())
10029                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10030                                  "inner-loop induction only used outside "
10031                                  "of the outer vectorized loop.\n");
10032               return false;
10033             }
10034         }
10035
10036       nested_in_vect_loop = true;
10037       iv_loop = loop->inner;
10038     }
10039   else
10040     iv_loop = loop;
10041   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10042
10043   if (slp_node && !nunits.is_constant ())
10044     {
10045       /* The current SLP code creates the step value element-by-element.  */
10046       if (dump_enabled_p ())
10047         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10048                          "SLP induction not supported for variable-length"
10049                          " vectors.\n");
10050       return false;
10051     }
10052
10053   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10054     {
10055       if (dump_enabled_p ())
10056         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10057                          "floating point induction vectorization disabled\n");
10058       return false;
10059     }
10060
10061   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10062   gcc_assert (step_expr != NULL_TREE);
10063   if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10064       && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10065     {
10066       if (dump_enabled_p ())
10067         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10068                          "bit-precision induction vectorization not "
10069                          "supported.\n");
10070       return false;
10071     }
10072   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10073
10074   /* Check for backend support of PLUS/MINUS_EXPR. */
10075   if (!directly_supported_p (PLUS_EXPR, step_vectype)
10076       || !directly_supported_p (MINUS_EXPR, step_vectype))
10077     return false;
10078
10079   if (!vec_stmt) /* transformation not required.  */
10080     {
10081       unsigned inside_cost = 0, prologue_cost = 0;
10082       if (slp_node)
10083         {
10084           /* We eventually need to set a vector type on invariant
10085              arguments.  */
10086           unsigned j;
10087           slp_tree child;
10088           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10089             if (!vect_maybe_update_slp_op_vectype
10090                 (child, SLP_TREE_VECTYPE (slp_node)))
10091               {
10092                 if (dump_enabled_p ())
10093                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10094                                    "incompatible vector types for "
10095                                    "invariants\n");
10096                 return false;
10097               }
10098           /* loop cost for vec_loop.  */
10099           inside_cost
10100             = record_stmt_cost (cost_vec,
10101                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10102                                 vector_stmt, stmt_info, 0, vect_body);
10103           /* prologue cost for vec_init (if not nested) and step.  */
10104           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10105                                             scalar_to_vec,
10106                                             stmt_info, 0, vect_prologue);
10107         }
10108       else /* if (!slp_node) */
10109         {
10110           /* loop cost for vec_loop.  */
10111           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10112                                           stmt_info, 0, vect_body);
10113           /* prologue cost for vec_init and vec_step.  */
10114           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10115                                             stmt_info, 0, vect_prologue);
10116         }
10117       if (dump_enabled_p ())
10118         dump_printf_loc (MSG_NOTE, vect_location,
10119                          "vect_model_induction_cost: inside_cost = %d, "
10120                          "prologue_cost = %d .\n", inside_cost,
10121                          prologue_cost);
10122
10123       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10124       DUMP_VECT_SCOPE ("vectorizable_induction");
10125       return true;
10126     }
10127
10128   /* Transform.  */
10129
10130   /* Compute a vector variable, initialized with the first VF values of
10131      the induction variable.  E.g., for an iv with IV_PHI='X' and
10132      evolution S, for a vector of 4 units, we want to compute:
10133      [X, X + S, X + 2*S, X + 3*S].  */
10134
10135   if (dump_enabled_p ())
10136     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10137
10138   pe = loop_preheader_edge (iv_loop);
10139   /* Find the first insertion point in the BB.  */
10140   basic_block bb = gimple_bb (phi);
10141   si = gsi_after_labels (bb);
10142
10143   /* For SLP induction we have to generate several IVs as for example
10144      with group size 3 we need
10145        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10146        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
10147   if (slp_node)
10148     {
10149       /* Enforced above.  */
10150       unsigned int const_nunits = nunits.to_constant ();
10151
10152       /* The initial values are vectorized, but any lanes > group_size
10153          need adjustment.  */
10154       slp_tree init_node
10155         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10156
10157       /* Gather steps.  Since we do not vectorize inductions as
10158          cycles we have to reconstruct the step from SCEV data.  */
10159       unsigned group_size = SLP_TREE_LANES (slp_node);
10160       tree *steps = XALLOCAVEC (tree, group_size);
10161       tree *inits = XALLOCAVEC (tree, group_size);
10162       stmt_vec_info phi_info;
10163       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10164         {
10165           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10166           if (!init_node)
10167             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10168                                            pe->dest_idx);
10169         }
10170
10171       /* Now generate the IVs.  */
10172       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10173       gcc_assert ((const_nunits * nvects) % group_size == 0);
10174       unsigned nivs;
10175       if (nested_in_vect_loop)
10176         nivs = nvects;
10177       else
10178         {
10179           /* Compute the number of distinct IVs we need.  First reduce
10180              group_size if it is a multiple of const_nunits so we get
10181              one IV for a group_size of 4 but const_nunits 2.  */
10182           unsigned group_sizep = group_size;
10183           if (group_sizep % const_nunits == 0)
10184             group_sizep = group_sizep / const_nunits;
10185           nivs = least_common_multiple (group_sizep,
10186                                         const_nunits) / const_nunits;
10187         }
10188       tree stept = TREE_TYPE (step_vectype);
10189       tree lupdate_mul = NULL_TREE;
10190       if (!nested_in_vect_loop)
10191         {
10192           /* The number of iterations covered in one vector iteration.  */
10193           unsigned lup_mul = (nvects * const_nunits) / group_size;
10194           lupdate_mul
10195             = build_vector_from_val (step_vectype,
10196                                      SCALAR_FLOAT_TYPE_P (stept)
10197                                      ? build_real_from_wide (stept, lup_mul,
10198                                                              UNSIGNED)
10199                                      : build_int_cstu (stept, lup_mul));
10200         }
10201       tree peel_mul = NULL_TREE;
10202       gimple_seq init_stmts = NULL;
10203       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10204         {
10205           if (SCALAR_FLOAT_TYPE_P (stept))
10206             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10207                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10208           else
10209             peel_mul = gimple_convert (&init_stmts, stept,
10210                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10211           peel_mul = gimple_build_vector_from_val (&init_stmts,
10212                                                    step_vectype, peel_mul);
10213         }
10214       unsigned ivn;
10215       auto_vec<tree> vec_steps;
10216       for (ivn = 0; ivn < nivs; ++ivn)
10217         {
10218           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10219           tree_vector_builder init_elts (vectype, const_nunits, 1);
10220           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10221           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10222             {
10223               /* The scalar steps of the IVs.  */
10224               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10225               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10226               step_elts.quick_push (elt);
10227               if (!init_node)
10228                 {
10229                   /* The scalar inits of the IVs if not vectorized.  */
10230                   elt = inits[(ivn*const_nunits + eltn) % group_size];
10231                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
10232                                                   TREE_TYPE (elt)))
10233                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10234                                         TREE_TYPE (vectype), elt);
10235                   init_elts.quick_push (elt);
10236                 }
10237               /* The number of steps to add to the initial values.  */
10238               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10239               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10240                                    ? build_real_from_wide (stept,
10241                                                            mul_elt, UNSIGNED)
10242                                    : build_int_cstu (stept, mul_elt));
10243             }
10244           vec_step = gimple_build_vector (&init_stmts, &step_elts);
10245           vec_steps.safe_push (vec_step);
10246           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10247           if (peel_mul)
10248             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10249                                      step_mul, peel_mul);
10250           if (!init_node)
10251             vec_init = gimple_build_vector (&init_stmts, &init_elts);
10252
10253           /* Create the induction-phi that defines the induction-operand.  */
10254           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10255                                             "vec_iv_");
10256           induction_phi = create_phi_node (vec_dest, iv_loop->header);
10257           induc_def = PHI_RESULT (induction_phi);
10258
10259           /* Create the iv update inside the loop  */
10260           tree up = vec_step;
10261           if (lupdate_mul)
10262             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10263                                vec_step, lupdate_mul);
10264           gimple_seq stmts = NULL;
10265           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10266           vec_def = gimple_build (&stmts,
10267                                   PLUS_EXPR, step_vectype, vec_def, up);
10268           vec_def = gimple_convert (&stmts, vectype, vec_def);
10269           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10270           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10271                        UNKNOWN_LOCATION);
10272
10273           if (init_node)
10274             vec_init = vect_get_slp_vect_def (init_node, ivn);
10275           if (!nested_in_vect_loop
10276               && !integer_zerop (step_mul))
10277             {
10278               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10279               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10280                                  vec_step, step_mul);
10281               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10282                                       vec_def, up);
10283               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10284             }
10285
10286           /* Set the arguments of the phi node:  */
10287           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10288
10289           slp_node->push_vec_def (induction_phi);
10290         }
10291       if (!nested_in_vect_loop)
10292         {
10293           /* Fill up to the number of vectors we need for the whole group.  */
10294           nivs = least_common_multiple (group_size,
10295                                         const_nunits) / const_nunits;
10296           vec_steps.reserve (nivs-ivn);
10297           for (; ivn < nivs; ++ivn)
10298             {
10299               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10300               vec_steps.quick_push (vec_steps[0]);
10301             }
10302         }
10303
10304       /* Re-use IVs when we can.  We are generating further vector
10305          stmts by adding VF' * stride to the IVs generated above.  */
10306       if (ivn < nvects)
10307         {
10308           unsigned vfp
10309             = least_common_multiple (group_size, const_nunits) / group_size;
10310           tree lupdate_mul
10311             = build_vector_from_val (step_vectype,
10312                                      SCALAR_FLOAT_TYPE_P (stept)
10313                                      ? build_real_from_wide (stept,
10314                                                              vfp, UNSIGNED)
10315                                      : build_int_cstu (stept, vfp));
10316           for (; ivn < nvects; ++ivn)
10317             {
10318               gimple *iv
10319                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10320               tree def = gimple_get_lhs (iv);
10321               if (ivn < 2*nivs)
10322                 vec_steps[ivn - nivs]
10323                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10324                                   vec_steps[ivn - nivs], lupdate_mul);
10325               gimple_seq stmts = NULL;
10326               def = gimple_convert (&stmts, step_vectype, def);
10327               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10328                                   def, vec_steps[ivn % nivs]);
10329               def = gimple_convert (&stmts, vectype, def);
10330               if (gimple_code (iv) == GIMPLE_PHI)
10331                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10332               else
10333                 {
10334                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10335                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10336                 }
10337               slp_node->push_vec_def (def);
10338             }
10339         }
10340
10341       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10342       gcc_assert (!new_bb);
10343
10344       return true;
10345     }
10346
10347   init_expr = vect_phi_initial_value (phi);
10348
10349   gimple_seq stmts = NULL;
10350   if (!nested_in_vect_loop)
10351     {
10352       /* Convert the initial value to the IV update type.  */
10353       tree new_type = TREE_TYPE (step_expr);
10354       init_expr = gimple_convert (&stmts, new_type, init_expr);
10355
10356       /* If we are using the loop mask to "peel" for alignment then we need
10357          to adjust the start value here.  */
10358       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10359       if (skip_niters != NULL_TREE)
10360         {
10361           if (FLOAT_TYPE_P (vectype))
10362             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10363                                         skip_niters);
10364           else
10365             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10366           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10367                                          skip_niters, step_expr);
10368           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10369                                     init_expr, skip_step);
10370         }
10371     }
10372
10373   if (stmts)
10374     {
10375       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10376       gcc_assert (!new_bb);
10377     }
10378
10379   /* Create the vector that holds the initial_value of the induction.  */
10380   if (nested_in_vect_loop)
10381     {
10382       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
10383          been created during vectorization of previous stmts.  We obtain it
10384          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
10385       auto_vec<tree> vec_inits;
10386       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10387                                      init_expr, &vec_inits);
10388       vec_init = vec_inits[0];
10389       /* If the initial value is not of proper type, convert it.  */
10390       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10391         {
10392           new_stmt
10393             = gimple_build_assign (vect_get_new_ssa_name (vectype,
10394                                                           vect_simple_var,
10395                                                           "vec_iv_"),
10396                                    VIEW_CONVERT_EXPR,
10397                                    build1 (VIEW_CONVERT_EXPR, vectype,
10398                                            vec_init));
10399           vec_init = gimple_assign_lhs (new_stmt);
10400           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10401                                                  new_stmt);
10402           gcc_assert (!new_bb);
10403         }
10404     }
10405   else
10406     {
10407       /* iv_loop is the loop to be vectorized. Create:
10408          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10409       stmts = NULL;
10410       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10411
10412       unsigned HOST_WIDE_INT const_nunits;
10413       if (nunits.is_constant (&const_nunits))
10414         {
10415           tree_vector_builder elts (step_vectype, const_nunits, 1);
10416           elts.quick_push (new_name);
10417           for (i = 1; i < const_nunits; i++)
10418             {
10419               /* Create: new_name_i = new_name + step_expr  */
10420               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10421                                        new_name, step_expr);
10422               elts.quick_push (new_name);
10423             }
10424           /* Create a vector from [new_name_0, new_name_1, ...,
10425              new_name_nunits-1]  */
10426           vec_init = gimple_build_vector (&stmts, &elts);
10427         }
10428       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10429         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10430         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10431                                  new_name, step_expr);
10432       else
10433         {
10434           /* Build:
10435                 [base, base, base, ...]
10436                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10437           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10438           gcc_assert (flag_associative_math);
10439           tree index = build_index_vector (step_vectype, 0, 1);
10440           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10441                                                         new_name);
10442           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10443                                                         step_expr);
10444           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10445           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10446                                    vec_init, step_vec);
10447           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10448                                    vec_init, base_vec);
10449         }
10450       vec_init = gimple_convert (&stmts, vectype, vec_init);
10451
10452       if (stmts)
10453         {
10454           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10455           gcc_assert (!new_bb);
10456         }
10457     }
10458
10459
10460   /* Create the vector that holds the step of the induction.  */
10461   gimple_stmt_iterator *step_iv_si = NULL;
10462   if (nested_in_vect_loop)
10463     /* iv_loop is nested in the loop to be vectorized. Generate:
10464        vec_step = [S, S, S, S]  */
10465     new_name = step_expr;
10466   else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10467     {
10468       /* When we're using loop_len produced by SELEC_VL, the non-final
10469          iterations are not always processing VF elements.  So vectorize
10470          induction variable instead of
10471
10472            _21 = vect_vec_iv_.6_22 + { VF, ... };
10473
10474          We should generate:
10475
10476            _35 = .SELECT_VL (ivtmp_33, VF);
10477            vect_cst__22 = [vec_duplicate_expr] _35;
10478            _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
10479       gcc_assert (!slp_node);
10480       gimple_seq seq = NULL;
10481       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10482       tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10483       expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10484                                                  unshare_expr (len)),
10485                                    &seq, true, NULL_TREE);
10486       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10487                                step_expr);
10488       gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10489       step_iv_si = &si;
10490     }
10491   else
10492     {
10493       /* iv_loop is the loop to be vectorized. Generate:
10494           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10495       gimple_seq seq = NULL;
10496       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10497         {
10498           expr = build_int_cst (integer_type_node, vf);
10499           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10500         }
10501       else
10502         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10503       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10504                                expr, step_expr);
10505       if (seq)
10506         {
10507           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10508           gcc_assert (!new_bb);
10509         }
10510     }
10511
10512   t = unshare_expr (new_name);
10513   gcc_assert (CONSTANT_CLASS_P (new_name)
10514               || TREE_CODE (new_name) == SSA_NAME);
10515   new_vec = build_vector_from_val (step_vectype, t);
10516   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10517                                new_vec, step_vectype, step_iv_si);
10518
10519
10520   /* Create the following def-use cycle:
10521      loop prolog:
10522          vec_init = ...
10523          vec_step = ...
10524      loop:
10525          vec_iv = PHI <vec_init, vec_loop>
10526          ...
10527          STMT
10528          ...
10529          vec_loop = vec_iv + vec_step;  */
10530
10531   /* Create the induction-phi that defines the induction-operand.  */
10532   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10533   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10534   induc_def = PHI_RESULT (induction_phi);
10535
10536   /* Create the iv update inside the loop  */
10537   stmts = NULL;
10538   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10539   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10540   vec_def = gimple_convert (&stmts, vectype, vec_def);
10541   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10542   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10543
10544   /* Set the arguments of the phi node:  */
10545   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10546   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10547                UNKNOWN_LOCATION);
10548
10549   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10550   *vec_stmt = induction_phi;
10551
10552   /* In case that vectorization factor (VF) is bigger than the number
10553      of elements that we can fit in a vectype (nunits), we have to generate
10554      more than one vector stmt - i.e - we need to "unroll" the
10555      vector stmt by a factor VF/nunits.  For more details see documentation
10556      in vectorizable_operation.  */
10557
10558   if (ncopies > 1)
10559     {
10560       gimple_seq seq = NULL;
10561       /* FORNOW. This restriction should be relaxed.  */
10562       gcc_assert (!nested_in_vect_loop);
10563       /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1.  */
10564       gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10565
10566       /* Create the vector that holds the step of the induction.  */
10567       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10568         {
10569           expr = build_int_cst (integer_type_node, nunits);
10570           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10571         }
10572       else
10573         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10574       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10575                                expr, step_expr);
10576       if (seq)
10577         {
10578           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10579           gcc_assert (!new_bb);
10580         }
10581
10582       t = unshare_expr (new_name);
10583       gcc_assert (CONSTANT_CLASS_P (new_name)
10584                   || TREE_CODE (new_name) == SSA_NAME);
10585       new_vec = build_vector_from_val (step_vectype, t);
10586       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10587                                    new_vec, step_vectype, NULL);
10588
10589       vec_def = induc_def;
10590       for (i = 1; i < ncopies + 1; i++)
10591         {
10592           /* vec_i = vec_prev + vec_step  */
10593           gimple_seq stmts = NULL;
10594           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10595           vec_def = gimple_build (&stmts,
10596                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10597           vec_def = gimple_convert (&stmts, vectype, vec_def);
10598
10599           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10600           if (i < ncopies)
10601             {
10602               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10603               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10604             }
10605           else
10606             {
10607               /* vec_1 = vec_iv + (VF/n * S)
10608                  vec_2 = vec_1 + (VF/n * S)
10609                  ...
10610                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10611
10612                  vec_n is used as vec_loop to save the large step register and
10613                  related operations.  */
10614               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10615                            UNKNOWN_LOCATION);
10616             }
10617         }
10618     }
10619
10620   if (dump_enabled_p ())
10621     dump_printf_loc (MSG_NOTE, vect_location,
10622                      "transform induction: created def-use cycle: %G%G",
10623                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10624
10625   return true;
10626 }
10627
10628 /* Function vectorizable_live_operation_1.
10629
10630    helper function for vectorizable_live_operation.  */
10631
10632 static tree
10633 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10634                                stmt_vec_info stmt_info, basic_block exit_bb,
10635                                tree vectype, int ncopies, slp_tree slp_node,
10636                                tree bitsize, tree bitstart, tree vec_lhs,
10637                                tree lhs_type, gimple_stmt_iterator *exit_gsi)
10638 {
10639   gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10640
10641   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10642   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10643   for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10644     SET_PHI_ARG_DEF (phi, i, vec_lhs);
10645
10646   gimple_seq stmts = NULL;
10647   tree new_tree;
10648
10649   /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
10650   if (integer_zerop (bitstart))
10651     {
10652       tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10653                                       vec_lhs_phi, bitsize, bitstart);
10654
10655       /* Convert the extracted vector element to the scalar type.  */
10656       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10657     }
10658   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10659     {
10660       /* Emit:
10661
10662          SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10663
10664          where VEC_LHS is the vectorized live-out result and MASK is
10665          the loop mask for the final iteration.  */
10666       gcc_assert (ncopies == 1 && !slp_node);
10667       gimple_seq tem = NULL;
10668       gimple_stmt_iterator gsi = gsi_last (tem);
10669       tree len = vect_get_loop_len (loop_vinfo, &gsi,
10670                                     &LOOP_VINFO_LENS (loop_vinfo),
10671                                     1, vectype, 0, 0);
10672
10673       /* BIAS - 1.  */
10674       signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10675       tree bias_minus_one
10676         = int_const_binop (MINUS_EXPR,
10677                            build_int_cst (TREE_TYPE (len), biasval),
10678                            build_one_cst (TREE_TYPE (len)));
10679
10680       /* LAST_INDEX = LEN + (BIAS - 1).  */
10681       tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10682                                      len, bias_minus_one);
10683
10684       /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
10685       tree scalar_res
10686         = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10687                         vec_lhs_phi, last_index);
10688
10689       /* Convert the extracted vector element to the scalar type.  */
10690       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10691     }
10692   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10693     {
10694       /* Emit:
10695
10696          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10697
10698          where VEC_LHS is the vectorized live-out result and MASK is
10699          the loop mask for the final iteration.  */
10700       gcc_assert (!slp_node);
10701       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10702       gimple_seq tem = NULL;
10703       gimple_stmt_iterator gsi = gsi_last (tem);
10704       tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10705                                       &LOOP_VINFO_MASKS (loop_vinfo),
10706                                       1, vectype, 0);
10707       tree scalar_res;
10708       gimple_seq_add_seq (&stmts, tem);
10709
10710       scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10711                                  mask, vec_lhs_phi);
10712
10713       /* Convert the extracted vector element to the scalar type.  */
10714       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10715     }
10716   else
10717     {
10718       tree bftype = TREE_TYPE (vectype);
10719       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10720         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10721       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10722       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10723                                        &stmts, true, NULL_TREE);
10724     }
10725
10726   *exit_gsi = gsi_after_labels (exit_bb);
10727   if (stmts)
10728     gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10729
10730   return new_tree;
10731 }
10732
10733 /* Function vectorizable_live_operation.
10734
10735    STMT_INFO computes a value that is used outside the loop.  Check if
10736    it can be supported.  */
10737
10738 bool
10739 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10740                              slp_tree slp_node, slp_instance slp_node_instance,
10741                              int slp_index, bool vec_stmt_p,
10742                              stmt_vector_for_cost *cost_vec)
10743 {
10744   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10745   imm_use_iterator imm_iter;
10746   tree lhs, lhs_type, bitsize;
10747   tree vectype = (slp_node
10748                   ? SLP_TREE_VECTYPE (slp_node)
10749                   : STMT_VINFO_VECTYPE (stmt_info));
10750   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10751   int ncopies;
10752   gimple *use_stmt;
10753   use_operand_p use_p;
10754   auto_vec<tree> vec_oprnds;
10755   int vec_entry = 0;
10756   poly_uint64 vec_index = 0;
10757
10758   gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10759               || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10760
10761   /* If a stmt of a reduction is live, vectorize it via
10762      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10763      validity so just trigger the transform here.  */
10764   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10765     {
10766       if (!vec_stmt_p)
10767         return true;
10768       if (slp_node)
10769         {
10770           /* For reduction chains the meta-info is attached to
10771              the group leader.  */
10772           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10773             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10774           /* For SLP reductions we vectorize the epilogue for
10775              all involved stmts together.  */
10776           else if (slp_index != 0)
10777             return true;
10778         }
10779       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10780       gcc_assert (reduc_info->is_reduc_info);
10781       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10782           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10783         return true;
10784
10785       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10786                                         slp_node_instance,
10787                                         LOOP_VINFO_IV_EXIT (loop_vinfo));
10788
10789       /* If early break we only have to materialize the reduction on the merge
10790          block, but we have to find an alternate exit first.  */
10791       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10792         {
10793           for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10794             if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10795               {
10796                 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10797                                                   slp_node, slp_node_instance,
10798                                                   exit);
10799                 break;
10800               }
10801         }
10802
10803       return true;
10804     }
10805
10806   /* If STMT is not relevant and it is a simple assignment and its inputs are
10807      invariant then it can remain in place, unvectorized.  The original last
10808      scalar value that it computes will be used.  */
10809   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10810     {
10811       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10812       if (dump_enabled_p ())
10813         dump_printf_loc (MSG_NOTE, vect_location,
10814                          "statement is simple and uses invariant.  Leaving in "
10815                          "place.\n");
10816       return true;
10817     }
10818
10819   if (slp_node)
10820     ncopies = 1;
10821   else
10822     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10823
10824   if (slp_node)
10825     {
10826       gcc_assert (slp_index >= 0);
10827
10828       /* Get the last occurrence of the scalar index from the concatenation of
10829          all the slp vectors. Calculate which slp vector it is and the index
10830          within.  */
10831       int num_scalar = SLP_TREE_LANES (slp_node);
10832       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10833       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10834
10835       /* Calculate which vector contains the result, and which lane of
10836          that vector we need.  */
10837       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10838         {
10839           if (dump_enabled_p ())
10840             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10841                              "Cannot determine which vector holds the"
10842                              " final result.\n");
10843           return false;
10844         }
10845     }
10846
10847   if (!vec_stmt_p)
10848     {
10849       /* No transformation required.  */
10850       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10851         {
10852           if (slp_node)
10853             {
10854               if (dump_enabled_p ())
10855                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10856                                  "can't operate on partial vectors "
10857                                  "because an SLP statement is live after "
10858                                  "the loop.\n");
10859               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10860             }
10861           else if (ncopies > 1)
10862             {
10863               if (dump_enabled_p ())
10864                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10865                                  "can't operate on partial vectors "
10866                                  "because ncopies is greater than 1.\n");
10867               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10868             }
10869           else
10870             {
10871               gcc_assert (ncopies == 1 && !slp_node);
10872               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10873                                                   OPTIMIZE_FOR_SPEED))
10874                 vect_record_loop_mask (loop_vinfo,
10875                                        &LOOP_VINFO_MASKS (loop_vinfo),
10876                                        1, vectype, NULL);
10877               else if (can_vec_extract_var_idx_p (
10878                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10879                 vect_record_loop_len (loop_vinfo,
10880                                       &LOOP_VINFO_LENS (loop_vinfo),
10881                                       1, vectype, 1);
10882               else
10883                 {
10884                   if (dump_enabled_p ())
10885                     dump_printf_loc (
10886                       MSG_MISSED_OPTIMIZATION, vect_location,
10887                       "can't operate on partial vectors "
10888                       "because the target doesn't support extract "
10889                       "last reduction.\n");
10890                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10891                 }
10892             }
10893         }
10894       /* ???  Enable for loop costing as well.  */
10895       if (!loop_vinfo)
10896         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10897                           0, vect_epilogue);
10898       return true;
10899     }
10900
10901   /* Use the lhs of the original scalar statement.  */
10902   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10903   if (dump_enabled_p ())
10904     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10905                      "stmt %G", stmt);
10906
10907   lhs = gimple_get_lhs (stmt);
10908   lhs_type = TREE_TYPE (lhs);
10909
10910   bitsize = vector_element_bits_tree (vectype);
10911
10912   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10913   tree vec_lhs, vec_lhs0, bitstart;
10914   gimple *vec_stmt, *vec_stmt0;
10915   if (slp_node)
10916     {
10917       gcc_assert (!loop_vinfo
10918                   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10919                       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10920
10921       /* Get the correct slp vectorized stmt.  */
10922       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10923       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10924
10925       /* In case we need to early break vectorize also get the first stmt.  */
10926       vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10927       vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10928
10929       /* Get entry to use.  */
10930       bitstart = bitsize_int (vec_index);
10931       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10932     }
10933   else
10934     {
10935       /* For multiple copies, get the last copy.  */
10936       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10937       vec_lhs = gimple_get_lhs (vec_stmt);
10938
10939       /* In case we need to early break vectorize also get the first stmt.  */
10940       vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10941       vec_lhs0 = gimple_get_lhs (vec_stmt0);
10942
10943       /* Get the last lane in the vector.  */
10944       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10945     }
10946
10947   if (loop_vinfo)
10948     {
10949       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10950          requirement, insert one phi node for it.  It looks like:
10951            loop;
10952          BB:
10953            # lhs' = PHI <lhs>
10954          ==>
10955            loop;
10956          BB:
10957            # vec_lhs' = PHI <vec_lhs>
10958            new_tree = lane_extract <vec_lhs', ...>;
10959            lhs' = new_tree;  */
10960
10961       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10962       /* Check if we have a loop where the chosen exit is not the main exit,
10963          in these cases for an early break we restart the iteration the vector code
10964          did.  For the live values we want the value at the start of the iteration
10965          rather than at the end.  */
10966       edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10967       bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10968       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10969         if (!is_gimple_debug (use_stmt)
10970             && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10971           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10972             {
10973               edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10974                                            phi_arg_index_from_use (use_p));
10975               gcc_assert (loop_exit_edge_p (loop, e));
10976               bool main_exit_edge = e == main_e;
10977               tree tmp_vec_lhs = vec_lhs;
10978               tree tmp_bitstart = bitstart;
10979
10980               /* For early exit where the exit is not in the BB that leads
10981                  to the latch then we're restarting the iteration in the
10982                  scalar loop.  So get the first live value.  */
10983               if ((all_exits_as_early_p || !main_exit_edge)
10984                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10985                 {
10986                   tmp_vec_lhs = vec_lhs0;
10987                   tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10988                 }
10989
10990               gimple_stmt_iterator exit_gsi;
10991               tree new_tree
10992                 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10993                                                  e->dest, vectype, ncopies,
10994                                                  slp_node, bitsize,
10995                                                  tmp_bitstart, tmp_vec_lhs,
10996                                                  lhs_type, &exit_gsi);
10997
10998               auto gsi = gsi_for_stmt (use_stmt);
10999               remove_phi_node (&gsi, false);
11000               tree lhs_phi = gimple_phi_result (use_stmt);
11001               gimple *copy = gimple_build_assign (lhs_phi, new_tree);
11002               gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
11003               break;
11004             }
11005
11006       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
11007       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11008         gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11009     }
11010   else
11011     {
11012       /* For basic-block vectorization simply insert the lane-extraction.  */
11013       tree bftype = TREE_TYPE (vectype);
11014       if (VECTOR_BOOLEAN_TYPE_P (vectype))
11015         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11016       tree new_tree = build3 (BIT_FIELD_REF, bftype,
11017                               vec_lhs, bitsize, bitstart);
11018       gimple_seq stmts = NULL;
11019       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11020                                        &stmts, true, NULL_TREE);
11021       if (TREE_CODE (new_tree) == SSA_NAME
11022           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11023         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11024       if (is_a <gphi *> (vec_stmt))
11025         {
11026           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11027           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11028         }
11029       else
11030         {
11031           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11032           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11033         }
11034
11035       /* Replace use of lhs with newly computed result.  If the use stmt is a
11036          single arg PHI, just replace all uses of PHI result.  It's necessary
11037          because lcssa PHI defining lhs may be before newly inserted stmt.  */
11038       use_operand_p use_p;
11039       stmt_vec_info use_stmt_info;
11040       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11041         if (!is_gimple_debug (use_stmt)
11042             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11043                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11044           {
11045             /* ???  This can happen when the live lane ends up being
11046                rooted in a vector construction code-generated by an
11047                external SLP node (and code-generation for that already
11048                happened).  See gcc.dg/vect/bb-slp-47.c.
11049                Doing this is what would happen if that vector CTOR
11050                were not code-generated yet so it is not too bad.
11051                ???  In fact we'd likely want to avoid this situation
11052                in the first place.  */
11053             if (TREE_CODE (new_tree) == SSA_NAME
11054                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11055                 && gimple_code (use_stmt) != GIMPLE_PHI
11056                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11057                                                 use_stmt))
11058               {
11059                 if (dump_enabled_p ())
11060                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11061                                    "Using original scalar computation for "
11062                                    "live lane because use preceeds vector "
11063                                    "def\n");
11064                 continue;
11065               }
11066             /* ???  It can also happen that we end up pulling a def into
11067                a loop where replacing out-of-loop uses would require
11068                a new LC SSA PHI node.  Retain the original scalar in
11069                those cases as well.  PR98064.  */
11070             if (TREE_CODE (new_tree) == SSA_NAME
11071                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11072                 && (gimple_bb (use_stmt)->loop_father
11073                     != gimple_bb (vec_stmt)->loop_father)
11074                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11075                                         gimple_bb (use_stmt)->loop_father))
11076               {
11077                 if (dump_enabled_p ())
11078                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11079                                    "Using original scalar computation for "
11080                                    "live lane because there is an out-of-loop "
11081                                    "definition for it\n");
11082                 continue;
11083               }
11084             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11085               SET_USE (use_p, new_tree);
11086             update_stmt (use_stmt);
11087           }
11088     }
11089
11090   return true;
11091 }
11092
11093 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
11094
11095 static void
11096 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11097 {
11098   ssa_op_iter op_iter;
11099   imm_use_iterator imm_iter;
11100   def_operand_p def_p;
11101   gimple *ustmt;
11102
11103   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11104     {
11105       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11106         {
11107           basic_block bb;
11108
11109           if (!is_gimple_debug (ustmt))
11110             continue;
11111
11112           bb = gimple_bb (ustmt);
11113
11114           if (!flow_bb_inside_loop_p (loop, bb))
11115             {
11116               if (gimple_debug_bind_p (ustmt))
11117                 {
11118                   if (dump_enabled_p ())
11119                     dump_printf_loc (MSG_NOTE, vect_location,
11120                                      "killing debug use\n");
11121
11122                   gimple_debug_bind_reset_value (ustmt);
11123                   update_stmt (ustmt);
11124                 }
11125               else
11126                 gcc_unreachable ();
11127             }
11128         }
11129     }
11130 }
11131
11132 /* Given loop represented by LOOP_VINFO, return true if computation of
11133    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11134    otherwise.  */
11135
11136 static bool
11137 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11138 {
11139   /* Constant case.  */
11140   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11141     {
11142       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11143       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11144
11145       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11146       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11147       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11148         return true;
11149     }
11150
11151   widest_int max;
11152   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11153   /* Check the upper bound of loop niters.  */
11154   if (get_max_loop_iterations (loop, &max))
11155     {
11156       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11157       signop sgn = TYPE_SIGN (type);
11158       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11159       if (max < type_max)
11160         return true;
11161     }
11162   return false;
11163 }
11164
11165 /* Return a mask type with half the number of elements as OLD_TYPE,
11166    given that it should have mode NEW_MODE.  */
11167
11168 tree
11169 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11170 {
11171   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11172   return build_truth_vector_type_for_mode (nunits, new_mode);
11173 }
11174
11175 /* Return a mask type with twice as many elements as OLD_TYPE,
11176    given that it should have mode NEW_MODE.  */
11177
11178 tree
11179 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11180 {
11181   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11182   return build_truth_vector_type_for_mode (nunits, new_mode);
11183 }
11184
11185 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11186    contain a sequence of NVECTORS masks that each control a vector of type
11187    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
11188    these vector masks with the vector version of SCALAR_MASK.  */
11189
11190 void
11191 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11192                        unsigned int nvectors, tree vectype, tree scalar_mask)
11193 {
11194   gcc_assert (nvectors != 0);
11195
11196   if (scalar_mask)
11197     {
11198       scalar_cond_masked_key cond (scalar_mask, nvectors);
11199       loop_vinfo->scalar_cond_masked_set.add (cond);
11200     }
11201
11202   masks->mask_set.add (std::make_pair (vectype, nvectors));
11203 }
11204
11205 /* Given a complete set of masks MASKS, extract mask number INDEX
11206    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11207    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
11208
11209    See the comment above vec_loop_masks for more details about the mask
11210    arrangement.  */
11211
11212 tree
11213 vect_get_loop_mask (loop_vec_info loop_vinfo,
11214                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11215                     unsigned int nvectors, tree vectype, unsigned int index)
11216 {
11217   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11218       == vect_partial_vectors_while_ult)
11219     {
11220       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11221       tree mask_type = rgm->type;
11222
11223       /* Populate the rgroup's mask array, if this is the first time we've
11224          used it.  */
11225       if (rgm->controls.is_empty ())
11226         {
11227           rgm->controls.safe_grow_cleared (nvectors, true);
11228           for (unsigned int i = 0; i < nvectors; ++i)
11229             {
11230               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11231               /* Provide a dummy definition until the real one is available.  */
11232               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11233               rgm->controls[i] = mask;
11234             }
11235         }
11236
11237       tree mask = rgm->controls[index];
11238       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11239                     TYPE_VECTOR_SUBPARTS (vectype)))
11240         {
11241           /* A loop mask for data type X can be reused for data type Y
11242              if X has N times more elements than Y and if Y's elements
11243              are N times bigger than X's.  In this case each sequence
11244              of N elements in the loop mask will be all-zero or all-one.
11245              We can then view-convert the mask so that each sequence of
11246              N elements is replaced by a single element.  */
11247           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11248                                   TYPE_VECTOR_SUBPARTS (vectype)));
11249           gimple_seq seq = NULL;
11250           mask_type = truth_type_for (vectype);
11251           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11252           if (seq)
11253             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11254         }
11255       return mask;
11256     }
11257   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11258            == vect_partial_vectors_avx512)
11259     {
11260       /* The number of scalars per iteration and the number of vectors are
11261          both compile-time constants.  */
11262       unsigned int nscalars_per_iter
11263         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11264                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11265
11266       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11267
11268       /* The stored nV is dependent on the mask type produced.  */
11269       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11270                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11271                   == rgm->factor);
11272       nvectors = rgm->factor;
11273
11274       /* Populate the rgroup's mask array, if this is the first time we've
11275          used it.  */
11276       if (rgm->controls.is_empty ())
11277         {
11278           rgm->controls.safe_grow_cleared (nvectors, true);
11279           for (unsigned int i = 0; i < nvectors; ++i)
11280             {
11281               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11282               /* Provide a dummy definition until the real one is available.  */
11283               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11284               rgm->controls[i] = mask;
11285             }
11286         }
11287       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11288                     TYPE_VECTOR_SUBPARTS (vectype)))
11289         return rgm->controls[index];
11290
11291       /* Split the vector if needed.  Since we are dealing with integer mode
11292          masks with AVX512 we can operate on the integer representation
11293          performing the whole vector shifting.  */
11294       unsigned HOST_WIDE_INT factor;
11295       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11296                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
11297       gcc_assert (ok);
11298       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11299       tree mask_type = truth_type_for (vectype);
11300       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11301       unsigned vi = index / factor;
11302       unsigned vpart = index % factor;
11303       tree vec = rgm->controls[vi];
11304       gimple_seq seq = NULL;
11305       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11306                           lang_hooks.types.type_for_mode
11307                                 (TYPE_MODE (rgm->type), 1), vec);
11308       /* For integer mode masks simply shift the right bits into position.  */
11309       if (vpart != 0)
11310         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11311                             build_int_cst (integer_type_node,
11312                                            (TYPE_VECTOR_SUBPARTS (vectype)
11313                                             * vpart)));
11314       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11315                                     (TYPE_MODE (mask_type), 1), vec);
11316       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11317       if (seq)
11318         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11319       return vec;
11320     }
11321   else
11322     gcc_unreachable ();
11323 }
11324
11325 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11326    lengths for controlling an operation on VECTYPE.  The operation splits
11327    each element of VECTYPE into FACTOR separate subelements, measuring the
11328    length as a number of these subelements.  */
11329
11330 void
11331 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11332                       unsigned int nvectors, tree vectype, unsigned int factor)
11333 {
11334   gcc_assert (nvectors != 0);
11335   if (lens->length () < nvectors)
11336     lens->safe_grow_cleared (nvectors, true);
11337   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11338
11339   /* The number of scalars per iteration, scalar occupied bytes and
11340      the number of vectors are both compile-time constants.  */
11341   unsigned int nscalars_per_iter
11342     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11343                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11344
11345   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11346     {
11347       /* For now, we only support cases in which all loads and stores fall back
11348          to VnQI or none do.  */
11349       gcc_assert (!rgl->max_nscalars_per_iter
11350                   || (rgl->factor == 1 && factor == 1)
11351                   || (rgl->max_nscalars_per_iter * rgl->factor
11352                       == nscalars_per_iter * factor));
11353       rgl->max_nscalars_per_iter = nscalars_per_iter;
11354       rgl->type = vectype;
11355       rgl->factor = factor;
11356     }
11357 }
11358
11359 /* Given a complete set of lengths LENS, extract length number INDEX
11360    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11361    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
11362    multipled by the number of elements that should be processed.
11363    Insert any set-up statements before GSI.  */
11364
11365 tree
11366 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11367                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11368                    unsigned int index, unsigned int factor)
11369 {
11370   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11371   bool use_bias_adjusted_len =
11372     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11373
11374   /* Populate the rgroup's len array, if this is the first time we've
11375      used it.  */
11376   if (rgl->controls.is_empty ())
11377     {
11378       rgl->controls.safe_grow_cleared (nvectors, true);
11379       for (unsigned int i = 0; i < nvectors; ++i)
11380         {
11381           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11382           gcc_assert (len_type != NULL_TREE);
11383
11384           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11385
11386           /* Provide a dummy definition until the real one is available.  */
11387           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11388           rgl->controls[i] = len;
11389
11390           if (use_bias_adjusted_len)
11391             {
11392               gcc_assert (i == 0);
11393               tree adjusted_len =
11394                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11395               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11396               rgl->bias_adjusted_ctrl = adjusted_len;
11397             }
11398         }
11399     }
11400
11401   if (use_bias_adjusted_len)
11402     return rgl->bias_adjusted_ctrl;
11403
11404   tree loop_len = rgl->controls[index];
11405   if (rgl->factor == 1 && factor == 1)
11406     {
11407       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11408       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11409       if (maybe_ne (nunits1, nunits2))
11410         {
11411           /* A loop len for data type X can be reused for data type Y
11412              if X has N times more elements than Y and if Y's elements
11413              are N times bigger than X's.  */
11414           gcc_assert (multiple_p (nunits1, nunits2));
11415           factor = exact_div (nunits1, nunits2).to_constant ();
11416           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11417           gimple_seq seq = NULL;
11418           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11419                                    build_int_cst (iv_type, factor));
11420           if (seq)
11421             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11422         }
11423     }
11424   return loop_len;
11425 }
11426
11427 /* Scale profiling counters by estimation for LOOP which is vectorized
11428    by factor VF.
11429    If FLAT is true, the loop we started with had unrealistically flat
11430    profile.  */
11431
11432 static void
11433 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11434 {
11435   /* For flat profiles do not scale down proportionally by VF and only
11436      cap by known iteration count bounds.  */
11437   if (flat)
11438     {
11439       if (dump_file && (dump_flags & TDF_DETAILS))
11440         fprintf (dump_file,
11441                  "Vectorized loop profile seems flat; not scaling iteration "
11442                  "count down by the vectorization factor %i\n", vf);
11443       scale_loop_profile (loop, profile_probability::always (),
11444                           get_likely_max_loop_iterations_int (loop));
11445       return;
11446     }
11447   /* Loop body executes VF fewer times and exit increases VF times.  */
11448   profile_count entry_count = loop_preheader_edge (loop)->count ();
11449
11450   /* If we have unreliable loop profile avoid dropping entry
11451      count bellow header count.  This can happen since loops
11452      has unrealistically low trip counts.  */
11453   while (vf > 1
11454          && loop->header->count > entry_count
11455          && loop->header->count < entry_count * vf)
11456     {
11457       if (dump_file && (dump_flags & TDF_DETAILS))
11458         fprintf (dump_file,
11459                  "Vectorization factor %i seems too large for profile "
11460                  "prevoiusly believed to be consistent; reducing.\n", vf);
11461       vf /= 2;
11462     }
11463
11464   if (entry_count.nonzero_p ())
11465     set_edge_probability_and_rescale_others
11466             (exit_e,
11467              entry_count.probability_in (loop->header->count / vf));
11468   /* Avoid producing very large exit probability when we do not have
11469      sensible profile.  */
11470   else if (exit_e->probability < profile_probability::always () / (vf * 2))
11471     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11472   loop->latch->count = single_pred_edge (loop->latch)->count ();
11473
11474   scale_loop_profile (loop, profile_probability::always () / vf,
11475                       get_likely_max_loop_iterations_int (loop));
11476 }
11477
11478 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11479    latch edge values originally defined by it.  */
11480
11481 static void
11482 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11483                                      stmt_vec_info def_stmt_info)
11484 {
11485   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11486   if (!def || TREE_CODE (def) != SSA_NAME)
11487     return;
11488   stmt_vec_info phi_info;
11489   imm_use_iterator iter;
11490   use_operand_p use_p;
11491   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11492     {
11493       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11494       if (!phi)
11495         continue;
11496       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11497             && (phi_info = loop_vinfo->lookup_stmt (phi))
11498             && STMT_VINFO_RELEVANT_P (phi_info)))
11499         continue;
11500       loop_p loop = gimple_bb (phi)->loop_father;
11501       edge e = loop_latch_edge (loop);
11502       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11503         continue;
11504
11505       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11506           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11507           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11508         {
11509           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11510           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11511           gcc_assert (phi_defs.length () == latch_defs.length ());
11512           for (unsigned i = 0; i < phi_defs.length (); ++i)
11513             add_phi_arg (as_a <gphi *> (phi_defs[i]),
11514                          gimple_get_lhs (latch_defs[i]), e,
11515                          gimple_phi_arg_location (phi, e->dest_idx));
11516         }
11517       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11518         {
11519           /* For first order recurrences we have to update both uses of
11520              the latch definition, the one in the PHI node and the one
11521              in the generated VEC_PERM_EXPR.  */
11522           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11523           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11524           gcc_assert (phi_defs.length () == latch_defs.length ());
11525           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11526           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11527           for (unsigned i = 0; i < phi_defs.length (); ++i)
11528             {
11529               gassign *perm = as_a <gassign *> (phi_defs[i]);
11530               if (i > 0)
11531                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11532               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11533               update_stmt (perm);
11534             }
11535           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11536                        gimple_phi_arg_location (phi, e->dest_idx));
11537         }
11538     }
11539 }
11540
11541 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11542    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11543    stmt_vec_info.  */
11544
11545 static bool
11546 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11547                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11548 {
11549   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11550   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11551
11552   if (dump_enabled_p ())
11553     dump_printf_loc (MSG_NOTE, vect_location,
11554                      "------>vectorizing statement: %G", stmt_info->stmt);
11555
11556   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11557     vect_loop_kill_debug_uses (loop, stmt_info);
11558
11559   if (!STMT_VINFO_RELEVANT_P (stmt_info)
11560       && !STMT_VINFO_LIVE_P (stmt_info))
11561     {
11562       if (is_gimple_call (stmt_info->stmt)
11563           && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11564         {
11565           gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11566           *seen_store = stmt_info;
11567           return false;
11568         }
11569       return false;
11570     }
11571
11572   if (STMT_VINFO_VECTYPE (stmt_info))
11573     {
11574       poly_uint64 nunits
11575         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11576       if (!STMT_SLP_TYPE (stmt_info)
11577           && maybe_ne (nunits, vf)
11578           && dump_enabled_p ())
11579         /* For SLP VF is set according to unrolling factor, and not
11580            to vector size, hence for SLP this print is not valid.  */
11581         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11582     }
11583
11584   /* Pure SLP statements have already been vectorized.  We still need
11585      to apply loop vectorization to hybrid SLP statements.  */
11586   if (PURE_SLP_STMT (stmt_info))
11587     return false;
11588
11589   if (dump_enabled_p ())
11590     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11591
11592   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11593     *seen_store = stmt_info;
11594
11595   return true;
11596 }
11597
11598 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11599    in the hash_map with its corresponding values.  */
11600
11601 static tree
11602 find_in_mapping (tree t, void *context)
11603 {
11604   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11605
11606   tree *value = mapping->get (t);
11607   return value ? *value : t;
11608 }
11609
11610 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11611    original loop that has now been vectorized.
11612
11613    The inits of the data_references need to be advanced with the number of
11614    iterations of the main loop.  This has been computed in vect_do_peeling and
11615    is stored in parameter ADVANCE.  We first restore the data_references
11616    initial offset with the values recored in ORIG_DRS_INIT.
11617
11618    Since the loop_vec_info of this EPILOGUE was constructed for the original
11619    loop, its stmt_vec_infos all point to the original statements.  These need
11620    to be updated to point to their corresponding copies as well as the SSA_NAMES
11621    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11622
11623    The data_reference's connections also need to be updated.  Their
11624    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11625    stmt_vec_infos, their statements need to point to their corresponding copy,
11626    if they are gather loads or scatter stores then their reference needs to be
11627    updated to point to its corresponding copy and finally we set
11628    'base_misaligned' to false as we have already peeled for alignment in the
11629    prologue of the main loop.  */
11630
11631 static void
11632 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11633 {
11634   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11635   auto_vec<gimple *> stmt_worklist;
11636   hash_map<tree,tree> mapping;
11637   gimple *orig_stmt, *new_stmt;
11638   gimple_stmt_iterator epilogue_gsi;
11639   gphi_iterator epilogue_phi_gsi;
11640   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11641   basic_block *epilogue_bbs = get_loop_body (epilogue);
11642   unsigned i;
11643
11644   free (LOOP_VINFO_BBS (epilogue_vinfo));
11645   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11646
11647   /* Advance data_reference's with the number of iterations of the previous
11648      loop and its prologue.  */
11649   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11650
11651
11652   /* The EPILOGUE loop is a copy of the original loop so they share the same
11653      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11654      point to the copied statements.  We also create a mapping of all LHS' in
11655      the original loop and all the LHS' in the EPILOGUE and create worklists to
11656      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11657   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11658     {
11659       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11660            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11661         {
11662           new_stmt = epilogue_phi_gsi.phi ();
11663
11664           gcc_assert (gimple_uid (new_stmt) > 0);
11665           stmt_vinfo
11666             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11667
11668           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11669           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11670
11671           mapping.put (gimple_phi_result (orig_stmt),
11672                        gimple_phi_result (new_stmt));
11673           /* PHI nodes can not have patterns or related statements.  */
11674           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11675                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11676         }
11677
11678       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11679            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11680         {
11681           new_stmt = gsi_stmt (epilogue_gsi);
11682           if (is_gimple_debug (new_stmt))
11683             continue;
11684
11685           gcc_assert (gimple_uid (new_stmt) > 0);
11686           stmt_vinfo
11687             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11688
11689           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11690           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11691
11692           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11693             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11694
11695           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11696             {
11697               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11698               for (gimple_stmt_iterator gsi = gsi_start (seq);
11699                    !gsi_end_p (gsi); gsi_next (&gsi))
11700                 stmt_worklist.safe_push (gsi_stmt (gsi));
11701             }
11702
11703           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11704           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11705             {
11706               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11707               stmt_worklist.safe_push (stmt);
11708               /* Set BB such that the assert in
11709                 'get_initial_def_for_reduction' is able to determine that
11710                 the BB of the related stmt is inside this loop.  */
11711               gimple_set_bb (stmt,
11712                              gimple_bb (new_stmt));
11713               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11714               gcc_assert (related_vinfo == NULL
11715                           || related_vinfo == stmt_vinfo);
11716             }
11717         }
11718     }
11719
11720   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11721      using the original main loop and thus need to be updated to refer to the
11722      cloned variables used in the epilogue.  */
11723   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11724     {
11725       gimple *stmt = stmt_worklist[i];
11726       tree *new_op;
11727
11728       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11729         {
11730           tree op = gimple_op (stmt, j);
11731           if ((new_op = mapping.get(op)))
11732             gimple_set_op (stmt, j, *new_op);
11733           else
11734             {
11735               /* PR92429: The last argument of simplify_replace_tree disables
11736                  folding when replacing arguments.  This is required as
11737                  otherwise you might end up with different statements than the
11738                  ones analyzed in vect_loop_analyze, leading to different
11739                  vectorization.  */
11740               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11741                                           &find_in_mapping, &mapping, false);
11742               gimple_set_op (stmt, j, op);
11743             }
11744         }
11745     }
11746
11747   struct data_reference *dr;
11748   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11749   FOR_EACH_VEC_ELT (datarefs, i, dr)
11750     {
11751       orig_stmt = DR_STMT (dr);
11752       gcc_assert (gimple_uid (orig_stmt) > 0);
11753       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11754       /* Data references for gather loads and scatter stores do not use the
11755          updated offset we set using ADVANCE.  Instead we have to make sure the
11756          reference in the data references point to the corresponding copy of
11757          the original in the epilogue.  Make sure to update both
11758          gather/scatters recognized by dataref analysis and also other
11759          refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
11760       auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11761       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11762           || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11763         {
11764           DR_REF (dr)
11765             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11766                                      &find_in_mapping, &mapping);
11767           DR_BASE_ADDRESS (dr)
11768             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11769                                      &find_in_mapping, &mapping);
11770         }
11771       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11772       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11773       /* The vector size of the epilogue is smaller than that of the main loop
11774          so the alignment is either the same or lower. This means the dr will
11775          thus by definition be aligned.  */
11776       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11777     }
11778
11779   epilogue_vinfo->shared->datarefs_copy.release ();
11780   epilogue_vinfo->shared->save_datarefs ();
11781 }
11782
11783 /*  When vectorizing early break statements instructions that happen before
11784     the early break in the current BB need to be moved to after the early
11785     break.  This function deals with that and assumes that any validity
11786     checks has already been performed.
11787
11788     While moving the instructions if it encounters a VUSE or VDEF it then
11789     corrects the VUSES as it moves the statements along.  GDEST is the location
11790     in which to insert the new statements.  */
11791
11792 static void
11793 move_early_exit_stmts (loop_vec_info loop_vinfo)
11794 {
11795   DUMP_VECT_SCOPE ("move_early_exit_stmts");
11796
11797   if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11798     return;
11799
11800   /* Move all stmts that need moving.  */
11801   basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11802   gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
11803
11804   tree last_seen_vuse = NULL_TREE;
11805   for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11806     {
11807       /* We have to update crossed degenerate virtual PHIs.  Simply
11808          elide them.  */
11809       if (gphi *vphi = dyn_cast <gphi *> (stmt))
11810         {
11811           tree vdef = gimple_phi_result (vphi);
11812           tree vuse = gimple_phi_arg_def (vphi, 0);
11813           imm_use_iterator iter;
11814           use_operand_p use_p;
11815           gimple *use_stmt;
11816           FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11817             {
11818               FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11819                 SET_USE (use_p, vuse);
11820             }
11821           auto gsi = gsi_for_stmt (stmt);
11822           remove_phi_node (&gsi, true);
11823           last_seen_vuse = vuse;
11824           continue;
11825         }
11826
11827       /* Check to see if statement is still required for vect or has been
11828          elided.  */
11829       auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11830       if (!stmt_info)
11831         continue;
11832
11833       if (dump_enabled_p ())
11834         dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11835
11836       gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11837       gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11838       last_seen_vuse = gimple_vuse (stmt);
11839     }
11840
11841   /* Update all the stmts with their new reaching VUSES.  */
11842   for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11843     {
11844       if (dump_enabled_p ())
11845           dump_printf_loc (MSG_NOTE, vect_location,
11846                            "updating vuse to %T for load %G",
11847                            last_seen_vuse, p);
11848       gimple_set_vuse (p, last_seen_vuse);
11849       update_stmt (p);
11850     }
11851
11852   /* And update the LC PHIs on exits.  */
11853   for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP  (loop_vinfo)))
11854     if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11855       if (gphi *phi = get_virtual_phi (e->dest))
11856         SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11857 }
11858
11859 /* Function vect_transform_loop.
11860
11861    The analysis phase has determined that the loop is vectorizable.
11862    Vectorize the loop - created vectorized stmts to replace the scalar
11863    stmts in the loop, and update the loop exit condition.
11864    Returns scalar epilogue loop if any.  */
11865
11866 class loop *
11867 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11868 {
11869   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11870   class loop *epilogue = NULL;
11871   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11872   int nbbs = loop->num_nodes;
11873   int i;
11874   tree niters_vector = NULL_TREE;
11875   tree step_vector = NULL_TREE;
11876   tree niters_vector_mult_vf = NULL_TREE;
11877   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11878   unsigned int lowest_vf = constant_lower_bound (vf);
11879   gimple *stmt;
11880   bool check_profitability = false;
11881   unsigned int th;
11882   bool flat = maybe_flat_loop_profile (loop);
11883
11884   DUMP_VECT_SCOPE ("vec_transform_loop");
11885
11886   loop_vinfo->shared->check_datarefs ();
11887
11888   /* Use the more conservative vectorization threshold.  If the number
11889      of iterations is constant assume the cost check has been performed
11890      by our caller.  If the threshold makes all loops profitable that
11891      run at least the (estimated) vectorization factor number of times
11892      checking is pointless, too.  */
11893   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11894   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11895     {
11896       if (dump_enabled_p ())
11897         dump_printf_loc (MSG_NOTE, vect_location,
11898                          "Profitability threshold is %d loop iterations.\n",
11899                          th);
11900       check_profitability = true;
11901     }
11902
11903   /* Make sure there exists a single-predecessor exit bb.  Do this before
11904      versioning.   */
11905   edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11906   if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11907     {
11908       split_loop_exit_edge (e, true);
11909       if (dump_enabled_p ())
11910         dump_printf (MSG_NOTE, "split exit edge\n");
11911     }
11912
11913   /* Version the loop first, if required, so the profitability check
11914      comes first.  */
11915
11916   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11917     {
11918       class loop *sloop
11919         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11920       sloop->force_vectorize = false;
11921       check_profitability = false;
11922     }
11923
11924   /* Make sure there exists a single-predecessor exit bb also on the
11925      scalar loop copy.  Do this after versioning but before peeling
11926      so CFG structure is fine for both scalar and if-converted loop
11927      to make slpeel_duplicate_current_defs_from_edges face matched
11928      loop closed PHI nodes on the exit.  */
11929   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11930     {
11931       e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11932       if (! single_pred_p (e->dest))
11933         {
11934           split_loop_exit_edge (e, true);
11935           if (dump_enabled_p ())
11936             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11937         }
11938     }
11939
11940   tree niters = vect_build_loop_niters (loop_vinfo);
11941   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11942   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11943   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11944   tree advance;
11945   drs_init_vec orig_drs_init;
11946
11947   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11948                               &step_vector, &niters_vector_mult_vf, th,
11949                               check_profitability, niters_no_overflow,
11950                               &advance);
11951   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11952       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11953     {
11954       /* Ifcvt duplicates loop preheader, loop body and produces an basic
11955          block after loop exit.  We need to scale all that.  */
11956       basic_block preheader
11957         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11958       preheader->count
11959         = preheader->count.apply_probability
11960               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11961       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11962                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11963       LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11964     }
11965
11966   if (niters_vector == NULL_TREE)
11967     {
11968       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11969           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11970           && known_eq (lowest_vf, vf))
11971         {
11972           niters_vector
11973             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11974                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11975           step_vector = build_one_cst (TREE_TYPE (niters));
11976         }
11977       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11978         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11979                                      &step_vector, niters_no_overflow);
11980       else
11981         /* vect_do_peeling subtracted the number of peeled prologue
11982            iterations from LOOP_VINFO_NITERS.  */
11983         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11984                                      &niters_vector, &step_vector,
11985                                      niters_no_overflow);
11986     }
11987
11988   /* 1) Make sure the loop header has exactly two entries
11989      2) Make sure we have a preheader basic block.  */
11990
11991   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11992
11993   split_edge (loop_preheader_edge (loop));
11994
11995   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11996     /* This will deal with any possible peeling.  */
11997     vect_prepare_for_masked_peels (loop_vinfo);
11998
11999   /* Handle any code motion that we need to for early-break vectorization after
12000      we've done peeling but just before we start vectorizing.  */
12001   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
12002     move_early_exit_stmts (loop_vinfo);
12003
12004   /* Schedule the SLP instances first, then handle loop vectorization
12005      below.  */
12006   if (!loop_vinfo->slp_instances.is_empty ())
12007     {
12008       DUMP_VECT_SCOPE ("scheduling SLP instances");
12009       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
12010     }
12011
12012   /* FORNOW: the vectorizer supports only loops which body consist
12013      of one basic block (header + empty latch). When the vectorizer will
12014      support more involved loop forms, the order by which the BBs are
12015      traversed need to be reconsidered.  */
12016
12017   for (i = 0; i < nbbs; i++)
12018     {
12019       basic_block bb = bbs[i];
12020       stmt_vec_info stmt_info;
12021
12022       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12023            gsi_next (&si))
12024         {
12025           gphi *phi = si.phi ();
12026           if (dump_enabled_p ())
12027             dump_printf_loc (MSG_NOTE, vect_location,
12028                              "------>vectorizing phi: %G", (gimple *) phi);
12029           stmt_info = loop_vinfo->lookup_stmt (phi);
12030           if (!stmt_info)
12031             continue;
12032
12033           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12034             vect_loop_kill_debug_uses (loop, stmt_info);
12035
12036           if (!STMT_VINFO_RELEVANT_P (stmt_info)
12037               && !STMT_VINFO_LIVE_P (stmt_info))
12038             continue;
12039
12040           if (STMT_VINFO_VECTYPE (stmt_info)
12041               && (maybe_ne
12042                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12043               && dump_enabled_p ())
12044             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12045
12046           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12047                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12048                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12049                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12050                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12051                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12052               && ! PURE_SLP_STMT (stmt_info))
12053             {
12054               if (dump_enabled_p ())
12055                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12056               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12057             }
12058         }
12059
12060       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12061            gsi_next (&si))
12062         {
12063           gphi *phi = si.phi ();
12064           stmt_info = loop_vinfo->lookup_stmt (phi);
12065           if (!stmt_info)
12066             continue;
12067
12068           if (!STMT_VINFO_RELEVANT_P (stmt_info)
12069               && !STMT_VINFO_LIVE_P (stmt_info))
12070             continue;
12071
12072           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12073                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12074                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12075                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12076                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12077                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12078               && ! PURE_SLP_STMT (stmt_info))
12079             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12080         }
12081
12082       for (gimple_stmt_iterator si = gsi_start_bb (bb);
12083            !gsi_end_p (si);)
12084         {
12085           stmt = gsi_stmt (si);
12086           /* During vectorization remove existing clobber stmts.  */
12087           if (gimple_clobber_p (stmt))
12088             {
12089               unlink_stmt_vdef (stmt);
12090               gsi_remove (&si, true);
12091               release_defs (stmt);
12092             }
12093           else
12094             {
12095               /* Ignore vector stmts created in the outer loop.  */
12096               stmt_info = loop_vinfo->lookup_stmt (stmt);
12097
12098               /* vector stmts created in the outer-loop during vectorization of
12099                  stmts in an inner-loop may not have a stmt_info, and do not
12100                  need to be vectorized.  */
12101               stmt_vec_info seen_store = NULL;
12102               if (stmt_info)
12103                 {
12104                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12105                     {
12106                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12107                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12108                            !gsi_end_p (subsi); gsi_next (&subsi))
12109                         {
12110                           stmt_vec_info pat_stmt_info
12111                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12112                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12113                                                     &si, &seen_store);
12114                         }
12115                       stmt_vec_info pat_stmt_info
12116                         = STMT_VINFO_RELATED_STMT (stmt_info);
12117                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12118                                                     &si, &seen_store))
12119                         maybe_set_vectorized_backedge_value (loop_vinfo,
12120                                                              pat_stmt_info);
12121                     }
12122                   else
12123                     {
12124                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12125                                                     &seen_store))
12126                         maybe_set_vectorized_backedge_value (loop_vinfo,
12127                                                              stmt_info);
12128                     }
12129                 }
12130               gsi_next (&si);
12131               if (seen_store)
12132                 {
12133                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12134                     /* Interleaving.  If IS_STORE is TRUE, the
12135                        vectorization of the interleaving chain was
12136                        completed - free all the stores in the chain.  */
12137                     vect_remove_stores (loop_vinfo,
12138                                         DR_GROUP_FIRST_ELEMENT (seen_store));
12139                   else
12140                     /* Free the attached stmt_vec_info and remove the stmt.  */
12141                     loop_vinfo->remove_stmt (stmt_info);
12142                 }
12143             }
12144         }
12145
12146       /* Stub out scalar statements that must not survive vectorization.
12147          Doing this here helps with grouped statements, or statements that
12148          are involved in patterns.  */
12149       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12150            !gsi_end_p (gsi); gsi_next (&gsi))
12151         {
12152           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12153           if (!call || !gimple_call_internal_p (call))
12154             continue;
12155           internal_fn ifn = gimple_call_internal_fn (call);
12156           if (ifn == IFN_MASK_LOAD)
12157             {
12158               tree lhs = gimple_get_lhs (call);
12159               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12160                 {
12161                   tree zero = build_zero_cst (TREE_TYPE (lhs));
12162                   gimple *new_stmt = gimple_build_assign (lhs, zero);
12163                   gsi_replace (&gsi, new_stmt, true);
12164                 }
12165             }
12166           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12167             {
12168               tree lhs = gimple_get_lhs (call);
12169               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12170                 {
12171                   tree else_arg
12172                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12173                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12174                   gsi_replace (&gsi, new_stmt, true);
12175                 }
12176             }
12177         }
12178     }                           /* BBs in loop */
12179
12180   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
12181      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
12182   if (integer_onep (step_vector))
12183     niters_no_overflow = true;
12184   vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12185                            niters_vector, step_vector, niters_vector_mult_vf,
12186                            !niters_no_overflow);
12187
12188   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12189
12190   /* True if the final iteration might not handle a full vector's
12191      worth of scalar iterations.  */
12192   bool final_iter_may_be_partial
12193     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
12194       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
12195   /* The minimum number of iterations performed by the epilogue.  This
12196      is 1 when peeling for gaps because we always need a final scalar
12197      iteration.  */
12198   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12199   /* +1 to convert latch counts to loop iteration counts,
12200      -min_epilogue_iters to remove iterations that cannot be performed
12201        by the vector code.  */
12202   int bias_for_lowest = 1 - min_epilogue_iters;
12203   int bias_for_assumed = bias_for_lowest;
12204   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12205   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12206     {
12207       /* When the amount of peeling is known at compile time, the first
12208          iteration will have exactly alignment_npeels active elements.
12209          In the worst case it will have at least one.  */
12210       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12211       bias_for_lowest += lowest_vf - min_first_active;
12212       bias_for_assumed += assumed_vf - min_first_active;
12213     }
12214   /* In these calculations the "- 1" converts loop iteration counts
12215      back to latch counts.  */
12216   if (loop->any_upper_bound)
12217     {
12218       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12219       loop->nb_iterations_upper_bound
12220         = (final_iter_may_be_partial
12221            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12222                             lowest_vf) - 1
12223            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12224                              lowest_vf) - 1);
12225       if (main_vinfo
12226           /* Both peeling for alignment and peeling for gaps can end up
12227              with the scalar epilogue running for more than VF-1 iterations.  */
12228           && !main_vinfo->peeling_for_alignment
12229           && !main_vinfo->peeling_for_gaps)
12230         {
12231           unsigned int bound;
12232           poly_uint64 main_iters
12233             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12234                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12235           main_iters
12236             = upper_bound (main_iters,
12237                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12238           if (can_div_away_from_zero_p (main_iters,
12239                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12240                                         &bound))
12241             loop->nb_iterations_upper_bound
12242               = wi::umin ((bound_wide_int) (bound - 1),
12243                           loop->nb_iterations_upper_bound);
12244       }
12245   }
12246   if (loop->any_likely_upper_bound)
12247     loop->nb_iterations_likely_upper_bound
12248       = (final_iter_may_be_partial
12249          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12250                           + bias_for_lowest, lowest_vf) - 1
12251          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12252                            + bias_for_lowest, lowest_vf) - 1);
12253   if (loop->any_estimate)
12254     loop->nb_iterations_estimate
12255       = (final_iter_may_be_partial
12256          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12257                           assumed_vf) - 1
12258          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12259                            assumed_vf) - 1);
12260   scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12261                                assumed_vf, flat);
12262
12263   if (dump_enabled_p ())
12264     {
12265       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12266         {
12267           dump_printf_loc (MSG_NOTE, vect_location,
12268                            "LOOP VECTORIZED\n");
12269           if (loop->inner)
12270             dump_printf_loc (MSG_NOTE, vect_location,
12271                              "OUTER LOOP VECTORIZED\n");
12272           dump_printf (MSG_NOTE, "\n");
12273         }
12274       else
12275         dump_printf_loc (MSG_NOTE, vect_location,
12276                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12277                          GET_MODE_NAME (loop_vinfo->vector_mode));
12278     }
12279
12280   /* Loops vectorized with a variable factor won't benefit from
12281      unrolling/peeling.  */
12282   if (!vf.is_constant ())
12283     {
12284       loop->unroll = 1;
12285       if (dump_enabled_p ())
12286         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12287                          " variable-length vectorization factor\n");
12288     }
12289   /* Free SLP instances here because otherwise stmt reference counting
12290      won't work.  */
12291   slp_instance instance;
12292   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12293     vect_free_slp_instance (instance);
12294   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12295   /* Clear-up safelen field since its value is invalid after vectorization
12296      since vectorized loop can have loop-carried dependencies.  */
12297   loop->safelen = 0;
12298
12299   if (epilogue)
12300     {
12301       update_epilogue_loop_vinfo (epilogue, advance);
12302
12303       epilogue->simduid = loop->simduid;
12304       epilogue->force_vectorize = loop->force_vectorize;
12305       epilogue->dont_vectorize = false;
12306     }
12307
12308   return epilogue;
12309 }
12310
12311 /* The code below is trying to perform simple optimization - revert
12312    if-conversion for masked stores, i.e. if the mask of a store is zero
12313    do not perform it and all stored value producers also if possible.
12314    For example,
12315      for (i=0; i<n; i++)
12316        if (c[i])
12317         {
12318           p1[i] += 1;
12319           p2[i] = p3[i] +2;
12320         }
12321    this transformation will produce the following semi-hammock:
12322
12323    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12324      {
12325        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12326        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12327        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12328        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12329        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12330        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12331      }
12332 */
12333
12334 void
12335 optimize_mask_stores (class loop *loop)
12336 {
12337   basic_block *bbs = get_loop_body (loop);
12338   unsigned nbbs = loop->num_nodes;
12339   unsigned i;
12340   basic_block bb;
12341   class loop *bb_loop;
12342   gimple_stmt_iterator gsi;
12343   gimple *stmt;
12344   auto_vec<gimple *> worklist;
12345   auto_purge_vect_location sentinel;
12346
12347   vect_location = find_loop_location (loop);
12348   /* Pick up all masked stores in loop if any.  */
12349   for (i = 0; i < nbbs; i++)
12350     {
12351       bb = bbs[i];
12352       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12353            gsi_next (&gsi))
12354         {
12355           stmt = gsi_stmt (gsi);
12356           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12357             worklist.safe_push (stmt);
12358         }
12359     }
12360
12361   free (bbs);
12362   if (worklist.is_empty ())
12363     return;
12364
12365   /* Loop has masked stores.  */
12366   while (!worklist.is_empty ())
12367     {
12368       gimple *last, *last_store;
12369       edge e, efalse;
12370       tree mask;
12371       basic_block store_bb, join_bb;
12372       gimple_stmt_iterator gsi_to;
12373       tree vdef, new_vdef;
12374       gphi *phi;
12375       tree vectype;
12376       tree zero;
12377
12378       last = worklist.pop ();
12379       mask = gimple_call_arg (last, 2);
12380       bb = gimple_bb (last);
12381       /* Create then_bb and if-then structure in CFG, then_bb belongs to
12382          the same loop as if_bb.  It could be different to LOOP when two
12383          level loop-nest is vectorized and mask_store belongs to the inner
12384          one.  */
12385       e = split_block (bb, last);
12386       bb_loop = bb->loop_father;
12387       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12388       join_bb = e->dest;
12389       store_bb = create_empty_bb (bb);
12390       add_bb_to_loop (store_bb, bb_loop);
12391       e->flags = EDGE_TRUE_VALUE;
12392       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12393       /* Put STORE_BB to likely part.  */
12394       efalse->probability = profile_probability::likely ();
12395       e->probability = efalse->probability.invert ();
12396       store_bb->count = efalse->count ();
12397       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12398       if (dom_info_available_p (CDI_DOMINATORS))
12399         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12400       if (dump_enabled_p ())
12401         dump_printf_loc (MSG_NOTE, vect_location,
12402                          "Create new block %d to sink mask stores.",
12403                          store_bb->index);
12404       /* Create vector comparison with boolean result.  */
12405       vectype = TREE_TYPE (mask);
12406       zero = build_zero_cst (vectype);
12407       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12408       gsi = gsi_last_bb (bb);
12409       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12410       /* Create new PHI node for vdef of the last masked store:
12411          .MEM_2 = VDEF <.MEM_1>
12412          will be converted to
12413          .MEM.3 = VDEF <.MEM_1>
12414          and new PHI node will be created in join bb
12415          .MEM_2 = PHI <.MEM_1, .MEM_3>
12416       */
12417       vdef = gimple_vdef (last);
12418       new_vdef = make_ssa_name (gimple_vop (cfun), last);
12419       gimple_set_vdef (last, new_vdef);
12420       phi = create_phi_node (vdef, join_bb);
12421       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12422
12423       /* Put all masked stores with the same mask to STORE_BB if possible.  */
12424       while (true)
12425         {
12426           gimple_stmt_iterator gsi_from;
12427           gimple *stmt1 = NULL;
12428
12429           /* Move masked store to STORE_BB.  */
12430           last_store = last;
12431           gsi = gsi_for_stmt (last);
12432           gsi_from = gsi;
12433           /* Shift GSI to the previous stmt for further traversal.  */
12434           gsi_prev (&gsi);
12435           gsi_to = gsi_start_bb (store_bb);
12436           gsi_move_before (&gsi_from, &gsi_to);
12437           /* Setup GSI_TO to the non-empty block start.  */
12438           gsi_to = gsi_start_bb (store_bb);
12439           if (dump_enabled_p ())
12440             dump_printf_loc (MSG_NOTE, vect_location,
12441                              "Move stmt to created bb\n%G", last);
12442           /* Move all stored value producers if possible.  */
12443           while (!gsi_end_p (gsi))
12444             {
12445               tree lhs;
12446               imm_use_iterator imm_iter;
12447               use_operand_p use_p;
12448               bool res;
12449
12450               /* Skip debug statements.  */
12451               if (is_gimple_debug (gsi_stmt (gsi)))
12452                 {
12453                   gsi_prev (&gsi);
12454                   continue;
12455                 }
12456               stmt1 = gsi_stmt (gsi);
12457               /* Do not consider statements writing to memory or having
12458                  volatile operand.  */
12459               if (gimple_vdef (stmt1)
12460                   || gimple_has_volatile_ops (stmt1))
12461                 break;
12462               gsi_from = gsi;
12463               gsi_prev (&gsi);
12464               lhs = gimple_get_lhs (stmt1);
12465               if (!lhs)
12466                 break;
12467
12468               /* LHS of vectorized stmt must be SSA_NAME.  */
12469               if (TREE_CODE (lhs) != SSA_NAME)
12470                 break;
12471
12472               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12473                 {
12474                   /* Remove dead scalar statement.  */
12475                   if (has_zero_uses (lhs))
12476                     {
12477                       gsi_remove (&gsi_from, true);
12478                       continue;
12479                     }
12480                 }
12481
12482               /* Check that LHS does not have uses outside of STORE_BB.  */
12483               res = true;
12484               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12485                 {
12486                   gimple *use_stmt;
12487                   use_stmt = USE_STMT (use_p);
12488                   if (is_gimple_debug (use_stmt))
12489                     continue;
12490                   if (gimple_bb (use_stmt) != store_bb)
12491                     {
12492                       res = false;
12493                       break;
12494                     }
12495                 }
12496               if (!res)
12497                 break;
12498
12499               if (gimple_vuse (stmt1)
12500                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
12501                 break;
12502
12503               /* Can move STMT1 to STORE_BB.  */
12504               if (dump_enabled_p ())
12505                 dump_printf_loc (MSG_NOTE, vect_location,
12506                                  "Move stmt to created bb\n%G", stmt1);
12507               gsi_move_before (&gsi_from, &gsi_to);
12508               /* Shift GSI_TO for further insertion.  */
12509               gsi_prev (&gsi_to);
12510             }
12511           /* Put other masked stores with the same mask to STORE_BB.  */
12512           if (worklist.is_empty ()
12513               || gimple_call_arg (worklist.last (), 2) != mask
12514               || worklist.last () != stmt1)
12515             break;
12516           last = worklist.pop ();
12517         }
12518       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12519     }
12520 }
12521
12522 /* Decide whether it is possible to use a zero-based induction variable
12523    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
12524    the value that the induction variable must be able to hold in order
12525    to ensure that the rgroups eventually have no active vector elements.
12526    Return -1 otherwise.  */
12527
12528 widest_int
12529 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12530 {
12531   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12532   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12533   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12534
12535   /* Calculate the value that the induction variable must be able
12536      to hit in order to ensure that we end the loop with an all-false mask.
12537      This involves adding the maximum number of inactive trailing scalar
12538      iterations.  */
12539   widest_int iv_limit = -1;
12540   if (max_loop_iterations (loop, &iv_limit))
12541     {
12542       if (niters_skip)
12543         {
12544           /* Add the maximum number of skipped iterations to the
12545              maximum iteration count.  */
12546           if (TREE_CODE (niters_skip) == INTEGER_CST)
12547             iv_limit += wi::to_widest (niters_skip);
12548           else
12549             iv_limit += max_vf - 1;
12550         }
12551       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12552         /* Make a conservatively-correct assumption.  */
12553         iv_limit += max_vf - 1;
12554
12555       /* IV_LIMIT is the maximum number of latch iterations, which is also
12556          the maximum in-range IV value.  Round this value down to the previous
12557          vector alignment boundary and then add an extra full iteration.  */
12558       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12559       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12560     }
12561   return iv_limit;
12562 }
12563
12564 /* For the given rgroup_controls RGC, check whether an induction variable
12565    would ever hit a value that produces a set of all-false masks or zero
12566    lengths before wrapping around.  Return true if it's possible to wrap
12567    around before hitting the desirable value, otherwise return false.  */
12568
12569 bool
12570 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12571 {
12572   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12573
12574   if (iv_limit == -1)
12575     return true;
12576
12577   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12578   unsigned int compare_precision = TYPE_PRECISION (compare_type);
12579   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12580
12581   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12582     return true;
12583
12584   return false;
12585 }