gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "memmodel.h"
  36 #include "optabs.h"
  37 #include "diagnostic-core.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "cfganal.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-ssa-loop.h"
  48 #include "cfgloop.h"
  49 #include "tree-scalar-evolution.h"
  50 #include "tree-vectorizer.h"
  51 #include "gimple-fold.h"
  52 #include "cgraph.h"
  53 #include "tree-cfg.h"
  54 #include "tree-if-conv.h"
  55 #include "internal-fn.h"
  56 #include "tree-vector-builder.h"
  57 #include "vec-perm-indices.h"
  58 #include "tree-eh.h"
  59 #include "case-cfn-macros.h"
  60 #include "langhooks.h"
  61
  62 /* Loop Vectorization Pass.
  63
  64    This pass tries to vectorize loops.
  65
  66    For example, the vectorizer transforms the following simple loop:
  67
  68         short a[N]; short b[N]; short c[N]; int i;
  69
  70         for (i=0; i<N; i++){
  71           a[i] = b[i] + c[i];
  72         }
  73
  74    as if it was manually vectorized by rewriting the source code into:
  75
  76         typedef int __attribute__((mode(V8HI))) v8hi;
  77         short a[N];  short b[N]; short c[N];   int i;
  78         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  79         v8hi va, vb, vc;
  80
  81         for (i=0; i<N/8; i++){
  82           vb = pb[i];
  83           vc = pc[i];
  84           va = vb + vc;
  85           pa[i] = va;
  86         }
  87
  88         The main entry to this pass is vectorize_loops(), in which
  89    the vectorizer applies a set of analyses on a given set of loops,
  90    followed by the actual vectorization transformation for the loops that
  91    had successfully passed the analysis phase.
  92         Throughout this pass we make a distinction between two types of
  93    data: scalars (which are represented by SSA_NAMES), and memory references
  94    ("data-refs").  These two types of data require different handling both
  95    during analysis and transformation. The types of data-refs that the
  96    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  97    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  98    accesses are required to have a simple (consecutive) access pattern.
  99
 100    Analysis phase:
 101    ===============
 102         The driver for the analysis phase is vect_analyze_loop().
 103    It applies a set of analyses, some of which rely on the scalar evolution
 104    analyzer (scev) developed by Sebastian Pop.
 105
 106         During the analysis phase the vectorizer records some information
 107    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 108    loop, as well as general information about the loop as a whole, which is
 109    recorded in a "loop_vec_info" struct attached to each loop.
 110
 111    Transformation phase:
 112    =====================
 113         The loop transformation phase scans all the stmts in the loop, and
 114    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 115    the loop that needs to be vectorized.  It inserts the vector code sequence
 116    just before the scalar stmt S, and records a pointer to the vector code
 117    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 118    attached to S).  This pointer will be used for the vectorization of following
 119    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 120    otherwise, we rely on dead code elimination for removing it.
 121
 122         For example, say stmt S1 was vectorized into stmt VS1:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    S2:  a = b;
 127
 128    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 129    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 130    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 131    resulting sequence would be:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    VS2: va = vb;
 136    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 137
 138         Operands that are not SSA_NAMEs, are data-refs that appear in
 139    load/store operations (like 'x[i]' in S1), and are handled differently.
 140
 141    Target modeling:
 142    =================
 143         Currently the only target specific information that is used is the
 144    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 145    Targets that can support different sizes of vectors, for now will need
 146    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 147    flexibility will be added in the future.
 148
 149         Since we only vectorize operations which vector form can be
 150    expressed using existing tree codes, to verify that an operation is
 151    supported, the vectorizer checks the relevant optab at the relevant
 152    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 153    the value found is CODE_FOR_nothing, then there's no target support, and
 154    we can't vectorize the stmt.
 155
 156    For additional information on this project see:
 157    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 158 */
 159
 160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 161                                                 unsigned *);
 162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 163                                                bool *, bool *, bool);
 164
 165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 166    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 167    may already be set for general statements (not just data refs).  */
 168
 169 static opt_result
 170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 171                               bool vectype_maybe_set_p,
 172                               poly_uint64 *vf)
 173 {
 174   gimple *stmt = stmt_info->stmt;
 175
 176   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 177        && !STMT_VINFO_LIVE_P (stmt_info))
 178       || gimple_clobber_p (stmt))
 179     {
 180       if (dump_enabled_p ())
 181         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 182       return opt_result::success ();
 183     }
 184
 185   tree stmt_vectype, nunits_vectype;
 186   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 187                                                    &stmt_vectype,
 188                                                    &nunits_vectype);
 189   if (!res)
 190     return res;
 191
 192   if (stmt_vectype)
 193     {
 194       if (STMT_VINFO_VECTYPE (stmt_info))
 195         /* The only case when a vectype had been already set is for stmts
 196            that contain a data ref, or for "pattern-stmts" (stmts generated
 197            by the vectorizer to represent/replace a certain idiom).  */
 198         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 199                      || vectype_maybe_set_p)
 200                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 201       else
 202         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 203     }
 204
 205   if (nunits_vectype)
 206     vect_update_max_nunits (vf, nunits_vectype);
 207
 208   return opt_result::success ();
 209 }
 210
 211 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 212    types of STMT_INFO and all attached pattern statements and update
 213    the vectorization factor VF accordingly.  Return true on success
 214    or false if something prevented vectorization.  */
 215
 216 static opt_result
 217 vect_determine_vf_for_stmt (vec_info *vinfo,
 218                             stmt_vec_info stmt_info, poly_uint64 *vf)
 219 {
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 222                      stmt_info->stmt);
 223   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 224   if (!res)
 225     return res;
 226
 227   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 228       && STMT_VINFO_RELATED_STMT (stmt_info))
 229     {
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 232
 233       /* If a pattern statement has def stmts, analyze them too.  */
 234       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 235            !gsi_end_p (si); gsi_next (&si))
 236         {
 237           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 238           if (dump_enabled_p ())
 239             dump_printf_loc (MSG_NOTE, vect_location,
 240                              "==> examining pattern def stmt: %G",
 241                              def_stmt_info->stmt);
 242           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 243           if (!res)
 244             return res;
 245         }
 246
 247       if (dump_enabled_p ())
 248         dump_printf_loc (MSG_NOTE, vect_location,
 249                          "==> examining pattern statement: %G",
 250                          stmt_info->stmt);
 251       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 252       if (!res)
 253         return res;
 254     }
 255
 256   return opt_result::success ();
 257 }
 258
 259 /* Function vect_determine_vectorization_factor
 260
 261    Determine the vectorization factor (VF).  VF is the number of data elements
 262    that are operated upon in parallel in a single iteration of the vectorized
 263    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 264    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 265    elements can fit in a single vector register.
 266
 267    We currently support vectorization of loops in which all types operated upon
 268    are of the same size.  Therefore this function currently sets VF according to
 269    the size of the types operated upon, and fails if there are multiple sizes
 270    in the loop.
 271
 272    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 273    original loop:
 274         for (i=0; i<N; i++){
 275           a[i] = b[i] + c[i];
 276         }
 277
 278    vectorized loop:
 279         for (i=0; i<N; i+=VF){
 280           a[i:VF] = b[i:VF] + c[i:VF];
 281         }
 282 */
 283
 284 static opt_result
 285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 286 {
 287   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 288   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 289   unsigned nbbs = loop->num_nodes;
 290   poly_uint64 vectorization_factor = 1;
 291   tree scalar_type = NULL_TREE;
 292   gphi *phi;
 293   tree vectype;
 294   stmt_vec_info stmt_info;
 295   unsigned i;
 296
 297   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 298
 299   for (i = 0; i < nbbs; i++)
 300     {
 301       basic_block bb = bbs[i];
 302
 303       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 304            gsi_next (&si))
 305         {
 306           phi = si.phi ();
 307           stmt_info = loop_vinfo->lookup_stmt (phi);
 308           if (dump_enabled_p ())
 309             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 310                              (gimple *) phi);
 311
 312           gcc_assert (stmt_info);
 313
 314           if (STMT_VINFO_RELEVANT_P (stmt_info)
 315               || STMT_VINFO_LIVE_P (stmt_info))
 316             {
 317               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 318               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 319
 320               if (dump_enabled_p ())
 321                 dump_printf_loc (MSG_NOTE, vect_location,
 322                                  "get vectype for scalar type:  %T\n",
 323                                  scalar_type);
 324
 325               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 326               if (!vectype)
 327                 return opt_result::failure_at (phi,
 328                                                "not vectorized: unsupported "
 329                                                "data-type %T\n",
 330                                                scalar_type);
 331               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 332
 333               if (dump_enabled_p ())
 334                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 335                                  vectype);
 336
 337               if (dump_enabled_p ())
 338                 {
 339                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 340                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 341                   dump_printf (MSG_NOTE, "\n");
 342                 }
 343
 344               vect_update_max_nunits (&vectorization_factor, vectype);
 345             }
 346         }
 347
 348       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 349            gsi_next (&si))
 350         {
 351           if (is_gimple_debug (gsi_stmt (si)))
 352             continue;
 353           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 354           opt_result res
 355             = vect_determine_vf_for_stmt (loop_vinfo,
 356                                           stmt_info, &vectorization_factor);
 357           if (!res)
 358             return res;
 359         }
 360     }
 361
 362   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 363   if (dump_enabled_p ())
 364     {
 365       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 366       dump_dec (MSG_NOTE, vectorization_factor);
 367       dump_printf (MSG_NOTE, "\n");
 368     }
 369
 370   if (known_le (vectorization_factor, 1U))
 371     return opt_result::failure_at (vect_location,
 372                                    "not vectorized: unsupported data-type\n");
 373   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 374   return opt_result::success ();
 375 }
 376
 377
 378 /* Function vect_is_simple_iv_evolution.
 379
 380    FORNOW: A simple evolution of an induction variables in the loop is
 381    considered a polynomial evolution.  */
 382
 383 static bool
 384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 385                              tree * step)
 386 {
 387   tree init_expr;
 388   tree step_expr;
 389   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 390   basic_block bb;
 391
 392   /* When there is no evolution in this loop, the evolution function
 393      is not "simple".  */
 394   if (evolution_part == NULL_TREE)
 395     return false;
 396
 397   /* When the evolution is a polynomial of degree >= 2
 398      the evolution function is not "simple".  */
 399   if (tree_is_chrec (evolution_part))
 400     return false;
 401
 402   step_expr = evolution_part;
 403   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 404
 405   if (dump_enabled_p ())
 406     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 407                      step_expr, init_expr);
 408
 409   *init = init_expr;
 410   *step = step_expr;
 411
 412   if (TREE_CODE (step_expr) != INTEGER_CST
 413       && (TREE_CODE (step_expr) != SSA_NAME
 414           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 415               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 416           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 417               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 418                   || !flag_associative_math)))
 419       && (TREE_CODE (step_expr) != REAL_CST
 420           || !flag_associative_math))
 421     {
 422       if (dump_enabled_p ())
 423         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 424                          "step unknown.\n");
 425       return false;
 426     }
 427
 428   return true;
 429 }
 430
 431 /* Function vect_is_nonlinear_iv_evolution
 432
 433    Only support nonlinear induction for integer type
 434    1. neg
 435    2. mul by constant
 436    3. lshift/rshift by constant.
 437
 438    For neg induction, return a fake step as integer -1.  */
 439 static bool
 440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 441                                 gphi* loop_phi_node, tree *init, tree *step)
 442 {
 443   tree init_expr, ev_expr, result, op1, op2;
 444   gimple* def;
 445
 446   if (gimple_phi_num_args (loop_phi_node) != 2)
 447     return false;
 448
 449   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 450   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 451
 452   /* Support nonlinear induction only for integer type.  */
 453   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 454     return false;
 455
 456   *init = init_expr;
 457   result = PHI_RESULT (loop_phi_node);
 458
 459   if (TREE_CODE (ev_expr) != SSA_NAME
 460       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 461       || !is_gimple_assign (def))
 462     return false;
 463
 464   enum tree_code t_code = gimple_assign_rhs_code (def);
 465   switch (t_code)
 466     {
 467     case NEGATE_EXPR:
 468       if (gimple_assign_rhs1 (def) != result)
 469         return false;
 470       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 471       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 472       break;
 473
 474     case RSHIFT_EXPR:
 475     case LSHIFT_EXPR:
 476     case MULT_EXPR:
 477       op1 = gimple_assign_rhs1 (def);
 478       op2 = gimple_assign_rhs2 (def);
 479       if (TREE_CODE (op2) != INTEGER_CST
 480           || op1 != result)
 481         return false;
 482       *step = op2;
 483       if (t_code == LSHIFT_EXPR)
 484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 485       else if (t_code == RSHIFT_EXPR)
 486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 487       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 488       else
 489         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 490       break;
 491
 492     default:
 493       return false;
 494     }
 495
 496   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 497   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 498
 499   return true;
 500 }
 501
 502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 503    what we are assuming is a double reduction.  For example, given
 504    a structure like this:
 505
 506       outer1:
 507         x_1 = PHI <x_4(outer2), ...>;
 508         ...
 509
 510       inner:
 511         x_2 = PHI <x_1(outer1), ...>;
 512         ...
 513         x_3 = ...;
 514         ...
 515
 516       outer2:
 517         x_4 = PHI <x_3(inner)>;
 518         ...
 519
 520    outer loop analysis would treat x_1 as a double reduction phi and
 521    this function would then return true for x_2.  */
 522
 523 static bool
 524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 525 {
 526   use_operand_p use_p;
 527   ssa_op_iter op_iter;
 528   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 529     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 530       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 531         return true;
 532   return false;
 533 }
 534
 535 /* Returns true if Phi is a first-order recurrence. A first-order
 536    recurrence is a non-reduction recurrence relation in which the value of
 537    the recurrence in the current loop iteration equals a value defined in
 538    the previous iteration.  */
 539
 540 static bool
 541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 542                                    gphi *phi)
 543 {
 544   /* A nested cycle isn't vectorizable as first order recurrence.  */
 545   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 546     return false;
 547
 548   /* Ensure the loop latch definition is from within the loop.  */
 549   edge latch = loop_latch_edge (loop);
 550   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 551   if (TREE_CODE (ldef) != SSA_NAME
 552       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 553       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 554       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 555     return false;
 556
 557   tree def = gimple_phi_result (phi);
 558
 559   /* Ensure every use_stmt of the phi node is dominated by the latch
 560      definition.  */
 561   imm_use_iterator imm_iter;
 562   use_operand_p use_p;
 563   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 564     if (!is_gimple_debug (USE_STMT (use_p))
 565         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 566             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 567                                             USE_STMT (use_p))))
 568       return false;
 569
 570   /* First-order recurrence autovectorization needs shuffle vector.  */
 571   tree scalar_type = TREE_TYPE (def);
 572   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 573   if (!vectype)
 574     return false;
 575
 576   return true;
 577 }
 578
 579 /* Function vect_analyze_scalar_cycles_1.
 580
 581    Examine the cross iteration def-use cycles of scalar variables
 582    in LOOP.  LOOP_VINFO represents the loop that is now being
 583    considered for vectorization (can be LOOP, or an outer-loop
 584    enclosing LOOP).  SLP indicates there will be some subsequent
 585    slp analyses or not.  */
 586
 587 static void
 588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 589                               bool slp)
 590 {
 591   basic_block bb = loop->header;
 592   tree init, step;
 593   auto_vec<stmt_vec_info, 64> worklist;
 594   gphi_iterator gsi;
 595   bool double_reduc, reduc_chain;
 596
 597   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 598
 599   /* First - identify all inductions.  Reduction detection assumes that all the
 600      inductions have been identified, therefore, this order must not be
 601      changed.  */
 602   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 603     {
 604       gphi *phi = gsi.phi ();
 605       tree access_fn = NULL;
 606       tree def = PHI_RESULT (phi);
 607       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 611                          (gimple *) phi);
 612
 613       /* Skip virtual phi's.  The data dependences that are associated with
 614          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 615       if (virtual_operand_p (def))
 616         continue;
 617
 618       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 619
 620       /* Analyze the evolution function.  */
 621       access_fn = analyze_scalar_evolution (loop, def);
 622       if (access_fn)
 623         {
 624           STRIP_NOPS (access_fn);
 625           if (dump_enabled_p ())
 626             dump_printf_loc (MSG_NOTE, vect_location,
 627                              "Access function of PHI: %T\n", access_fn);
 628           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 629             = initial_condition_in_loop_num (access_fn, loop->num);
 630           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 631             = evolution_part_in_loop_num (access_fn, loop->num);
 632         }
 633
 634       if ((!access_fn
 635            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 636            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 637                                             &init, &step)
 638            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 639                && TREE_CODE (step) != INTEGER_CST))
 640           /* Only handle nonlinear iv for same loop.  */
 641           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 642               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 643                                                   phi, &init, &step)))
 644         {
 645           worklist.safe_push (stmt_vinfo);
 646           continue;
 647         }
 648
 649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 650                   != NULL_TREE);
 651       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 652
 653       if (dump_enabled_p ())
 654         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 655       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 656     }
 657
 658
 659   /* Second - identify all reductions and nested cycles.  */
 660   while (worklist.length () > 0)
 661     {
 662       stmt_vec_info stmt_vinfo = worklist.pop ();
 663       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 664       tree def = PHI_RESULT (phi);
 665
 666       if (dump_enabled_p ())
 667         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 668                          (gimple *) phi);
 669
 670       gcc_assert (!virtual_operand_p (def)
 671                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 672
 673       stmt_vec_info reduc_stmt_info
 674         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 675                                     &reduc_chain, slp);
 676       if (reduc_stmt_info)
 677         {
 678           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 679           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 680           if (double_reduc)
 681             {
 682               if (dump_enabled_p ())
 683                 dump_printf_loc (MSG_NOTE, vect_location,
 684                                  "Detected double reduction.\n");
 685
 686               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 687               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 688             }
 689           else
 690             {
 691               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 692                 {
 693                   if (dump_enabled_p ())
 694                     dump_printf_loc (MSG_NOTE, vect_location,
 695                                      "Detected vectorizable nested cycle.\n");
 696
 697                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 698                 }
 699               else
 700                 {
 701                   if (dump_enabled_p ())
 702                     dump_printf_loc (MSG_NOTE, vect_location,
 703                                      "Detected reduction.\n");
 704
 705                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 706                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 707                   /* Store the reduction cycles for possible vectorization in
 708                      loop-aware SLP if it was not detected as reduction
 709                      chain.  */
 710                   if (! reduc_chain)
 711                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 712                       (reduc_stmt_info);
 713                 }
 714             }
 715         }
 716       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 717         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 718       else
 719         if (dump_enabled_p ())
 720           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                            "Unknown def-use cycle pattern.\n");
 722     }
 723 }
 724
 725
 726 /* Function vect_analyze_scalar_cycles.
 727
 728    Examine the cross iteration def-use cycles of scalar variables, by
 729    analyzing the loop-header PHIs of scalar variables.  Classify each
 730    cycle as one of the following: invariant, induction, reduction, unknown.
 731    We do that for the loop represented by LOOP_VINFO, and also to its
 732    inner-loop, if exists.
 733    Examples for scalar cycles:
 734
 735    Example1: reduction:
 736
 737               loop1:
 738               for (i=0; i<N; i++)
 739                  sum += a[i];
 740
 741    Example2: induction:
 742
 743               loop2:
 744               for (i=0; i<N; i++)
 745                  a[i] = i;  */
 746
 747 static void
 748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 749 {
 750   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 751
 752   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 753
 754   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 755      Reductions in such inner-loop therefore have different properties than
 756      the reductions in the nest that gets vectorized:
 757      1. When vectorized, they are executed in the same order as in the original
 758         scalar loop, so we can't change the order of computation when
 759         vectorizing them.
 760      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 761         current checks are too strict.  */
 762
 763   if (loop->inner)
 764     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 765 }
 766
 767 /* Transfer group and reduction information from STMT_INFO to its
 768    pattern stmt.  */
 769
 770 static void
 771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 772 {
 773   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 774   stmt_vec_info stmtp;
 775   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 776               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 777   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 778   do
 779     {
 780       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 781       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 782                            == STMT_VINFO_DEF_TYPE (stmt_info));
 783       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 784       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 785       if (stmt_info)
 786         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 787           = STMT_VINFO_RELATED_STMT (stmt_info);
 788     }
 789   while (stmt_info);
 790 }
 791
 792 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 793
 794 static void
 795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 796 {
 797   stmt_vec_info first;
 798   unsigned i;
 799
 800   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 801     {
 802       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 803       while (next)
 804         {
 805           if ((STMT_VINFO_IN_PATTERN_P (next)
 806                != STMT_VINFO_IN_PATTERN_P (first))
 807               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 808             break;
 809           next = REDUC_GROUP_NEXT_ELEMENT (next);
 810         }
 811       /* If all reduction chain members are well-formed patterns adjust
 812          the group to group the pattern stmts instead.  */
 813       if (! next
 814           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 815         {
 816           if (STMT_VINFO_IN_PATTERN_P (first))
 817             {
 818               vect_fixup_reduc_chain (first);
 819               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 820                 = STMT_VINFO_RELATED_STMT (first);
 821             }
 822         }
 823       /* If not all stmt in the chain are patterns or if we failed
 824          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 825          it as regular reduction instead.  */
 826       else
 827         {
 828           stmt_vec_info vinfo = first;
 829           stmt_vec_info last = NULL;
 830           while (vinfo)
 831             {
 832               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 833               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 834               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 835               last = vinfo;
 836               vinfo = next;
 837             }
 838           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 839             = vect_internal_def;
 840           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 841           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 842           --i;
 843         }
 844     }
 845 }
 846
 847 /* Function vect_get_loop_niters.
 848
 849    Determine how many iterations the loop is executed and place it
 850    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 851    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 852    niter information holds in ASSUMPTIONS.
 853
 854    Return the loop exit conditions.  */
 855
 856
 857 static vec<gcond *>
 858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
 859                       tree *number_of_iterations, tree *number_of_iterationsm1)
 860 {
 861   auto_vec<edge> exits = get_loop_exit_edges (loop);
 862   vec<gcond *> conds;
 863   conds.create (exits.length ());
 864   class tree_niter_desc niter_desc;
 865   tree niter_assumptions, niter, may_be_zero;
 866
 867   *assumptions = boolean_true_node;
 868   *number_of_iterationsm1 = chrec_dont_know;
 869   *number_of_iterations = chrec_dont_know;
 870
 871   DUMP_VECT_SCOPE ("get_loop_niters");
 872
 873   if (exits.is_empty ())
 874     return conds;
 875
 876   if (dump_enabled_p ())
 877     dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
 878                      exits.length ());
 879
 880   edge exit;
 881   unsigned int i;
 882   FOR_EACH_VEC_ELT (exits, i, exit)
 883     {
 884       gcond *cond = get_loop_exit_condition (exit);
 885       if (cond)
 886         conds.safe_push (cond);
 887
 888       if (dump_enabled_p ())
 889         dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
 890
 891       if (exit != main_exit)
 892         continue;
 893
 894       may_be_zero = NULL_TREE;
 895       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 896           || chrec_contains_undetermined (niter_desc.niter))
 897         continue;
 898
 899       niter_assumptions = niter_desc.assumptions;
 900       may_be_zero = niter_desc.may_be_zero;
 901       niter = niter_desc.niter;
 902
 903       if (may_be_zero && integer_zerop (may_be_zero))
 904         may_be_zero = NULL_TREE;
 905
 906       if (may_be_zero)
 907         {
 908           if (COMPARISON_CLASS_P (may_be_zero))
 909             {
 910               /* Try to combine may_be_zero with assumptions, this can simplify
 911                  computation of niter expression.  */
 912               if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 913                 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 914                                                  niter_assumptions,
 915                                                  fold_build1 (TRUTH_NOT_EXPR,
 916                                                               boolean_type_node,
 917                                                               may_be_zero));
 918               else
 919                 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 920                                      build_int_cst (TREE_TYPE (niter), 0),
 921                                      rewrite_to_non_trapping_overflow (niter));
 922
 923               may_be_zero = NULL_TREE;
 924             }
 925           else if (integer_nonzerop (may_be_zero))
 926             {
 927               *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 928               *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 929               continue;
 930             }
 931           else
 932             continue;
 933        }
 934
 935       /* Loop assumptions are based off the normal exit.  */
 936       *assumptions = niter_assumptions;
 937       *number_of_iterationsm1 = niter;
 938
 939       /* We want the number of loop header executions which is the number
 940          of latch executions plus one.
 941          ???  For UINT_MAX latch executions this number overflows to zero
 942          for loops like do { n++; } while (n != 0);  */
 943       if (niter && !chrec_contains_undetermined (niter))
 944         {
 945           niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
 946                                unshare_expr (niter),
 947                                build_int_cst (TREE_TYPE (niter), 1));
 948           if (TREE_CODE (niter) == INTEGER_CST
 949               && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
 950             {
 951               /* If we manage to fold niter + 1 into INTEGER_CST even when
 952                  niter is some complex expression, ensure back
 953                  *number_of_iterationsm1 is an INTEGER_CST as well.  See
 954                  PR113210.  */
 955               *number_of_iterationsm1
 956                 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
 957                                build_minus_one_cst (TREE_TYPE (niter)));
 958             }
 959         }
 960       *number_of_iterations = niter;
 961     }
 962
 963   if (dump_enabled_p ())
 964     dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
 965
 966   return conds;
 967 }
 968
 969 /*  Determine the main loop exit for the vectorizer.  */
 970
 971 edge
 972 vec_init_loop_exit_info (class loop *loop)
 973 {
 974   /* Before we begin we must first determine which exit is the main one and
 975      which are auxilary exits.  */
 976   auto_vec<edge> exits = get_loop_exit_edges (loop);
 977   if (exits.length () == 1)
 978     return exits[0];
 979
 980   /* If we have multiple exits we only support counting IV at the moment.
 981      Analyze all exits and return the last one we can analyze.  */
 982   class tree_niter_desc niter_desc;
 983   edge candidate = NULL;
 984   for (edge exit : exits)
 985     {
 986       if (!get_loop_exit_condition (exit))
 987         continue;
 988
 989       if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 990           && !chrec_contains_undetermined (niter_desc.niter))
 991         {
 992           tree may_be_zero = niter_desc.may_be_zero;
 993           if ((integer_zerop (may_be_zero)
 994                || integer_nonzerop (may_be_zero)
 995                || COMPARISON_CLASS_P (may_be_zero))
 996               && (!candidate
 997                   || dominated_by_p (CDI_DOMINATORS, exit->src,
 998                                      candidate->src)))
 999             candidate = exit;
1000         }
1001     }
1002
1003   return candidate;
1004 }
1005
1006 /* Function bb_in_loop_p
1007
1008    Used as predicate for dfs order traversal of the loop bbs.  */
1009
1010 static bool
1011 bb_in_loop_p (const_basic_block bb, const void *data)
1012 {
1013   const class loop *const loop = (const class loop *)data;
1014   if (flow_bb_inside_loop_p (loop, bb))
1015     return true;
1016   return false;
1017 }
1018
1019
1020 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1021    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1022
1023 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1024   : vec_info (vec_info::loop, shared),
1025     loop (loop_in),
1026     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1027     num_itersm1 (NULL_TREE),
1028     num_iters (NULL_TREE),
1029     num_iters_unchanged (NULL_TREE),
1030     num_iters_assumptions (NULL_TREE),
1031     vector_costs (nullptr),
1032     scalar_costs (nullptr),
1033     th (0),
1034     versioning_threshold (0),
1035     vectorization_factor (0),
1036     main_loop_edge (nullptr),
1037     skip_main_loop_edge (nullptr),
1038     skip_this_loop_edge (nullptr),
1039     reusable_accumulators (),
1040     suggested_unroll_factor (1),
1041     max_vectorization_factor (0),
1042     mask_skip_niters (NULL_TREE),
1043     rgroup_compare_type (NULL_TREE),
1044     simd_if_cond (NULL_TREE),
1045     partial_vector_style (vect_partial_vectors_none),
1046     unaligned_dr (NULL),
1047     peeling_for_alignment (0),
1048     ptr_mask (0),
1049     ivexpr_map (NULL),
1050     scan_map (NULL),
1051     slp_unrolling_factor (1),
1052     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1053     vectorizable (false),
1054     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1055     using_partial_vectors_p (false),
1056     using_decrementing_iv_p (false),
1057     using_select_vl_p (false),
1058     epil_using_partial_vectors_p (false),
1059     partial_load_store_bias (0),
1060     peeling_for_gaps (false),
1061     peeling_for_niter (false),
1062     early_breaks (false),
1063     no_data_dependencies (false),
1064     has_mask_store (false),
1065     scalar_loop_scaling (profile_probability::uninitialized ()),
1066     scalar_loop (NULL),
1067     orig_loop_info (NULL),
1068     vec_loop_iv_exit (NULL),
1069     vec_epilogue_loop_iv_exit (NULL),
1070     scalar_loop_iv_exit (NULL)
1071 {
1072   /* CHECKME: We want to visit all BBs before their successors (except for
1073      latch blocks, for which this assertion wouldn't hold).  In the simple
1074      case of the loop forms we allow, a dfs order of the BBs would the same
1075      as reversed postorder traversal, so we are safe.  */
1076
1077   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1078                                           bbs, loop->num_nodes, loop);
1079   gcc_assert (nbbs == loop->num_nodes);
1080
1081   for (unsigned int i = 0; i < nbbs; i++)
1082     {
1083       basic_block bb = bbs[i];
1084       gimple_stmt_iterator si;
1085
1086       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1087         {
1088           gimple *phi = gsi_stmt (si);
1089           gimple_set_uid (phi, 0);
1090           add_stmt (phi);
1091         }
1092
1093       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1094         {
1095           gimple *stmt = gsi_stmt (si);
1096           gimple_set_uid (stmt, 0);
1097           if (is_gimple_debug (stmt))
1098             continue;
1099           add_stmt (stmt);
1100           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1101              third argument is the #pragma omp simd if (x) condition, when 0,
1102              loop shouldn't be vectorized, when non-zero constant, it should
1103              be vectorized normally, otherwise versioned with vectorized loop
1104              done if the condition is non-zero at runtime.  */
1105           if (loop_in->simduid
1106               && is_gimple_call (stmt)
1107               && gimple_call_internal_p (stmt)
1108               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1109               && gimple_call_num_args (stmt) >= 3
1110               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1111               && (loop_in->simduid
1112                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1113             {
1114               tree arg = gimple_call_arg (stmt, 2);
1115               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1116                 simd_if_cond = arg;
1117               else
1118                 gcc_assert (integer_nonzerop (arg));
1119             }
1120         }
1121     }
1122
1123   epilogue_vinfos.create (6);
1124 }
1125
1126 /* Free all levels of rgroup CONTROLS.  */
1127
1128 void
1129 release_vec_loop_controls (vec<rgroup_controls> *controls)
1130 {
1131   rgroup_controls *rgc;
1132   unsigned int i;
1133   FOR_EACH_VEC_ELT (*controls, i, rgc)
1134     rgc->controls.release ();
1135   controls->release ();
1136 }
1137
1138 /* Free all memory used by the _loop_vec_info, as well as all the
1139    stmt_vec_info structs of all the stmts in the loop.  */
1140
1141 _loop_vec_info::~_loop_vec_info ()
1142 {
1143   free (bbs);
1144
1145   release_vec_loop_controls (&masks.rgc_vec);
1146   release_vec_loop_controls (&lens);
1147   delete ivexpr_map;
1148   delete scan_map;
1149   epilogue_vinfos.release ();
1150   delete scalar_costs;
1151   delete vector_costs;
1152
1153   /* When we release an epiloge vinfo that we do not intend to use
1154      avoid clearing AUX of the main loop which should continue to
1155      point to the main loop vinfo since otherwise we'll leak that.  */
1156   if (loop->aux == this)
1157     loop->aux = NULL;
1158 }
1159
1160 /* Return an invariant or register for EXPR and emit necessary
1161    computations in the LOOP_VINFO loop preheader.  */
1162
1163 tree
1164 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1165 {
1166   if (is_gimple_reg (expr)
1167       || is_gimple_min_invariant (expr))
1168     return expr;
1169
1170   if (! loop_vinfo->ivexpr_map)
1171     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1172   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1173   if (! cached)
1174     {
1175       gimple_seq stmts = NULL;
1176       cached = force_gimple_operand (unshare_expr (expr),
1177                                      &stmts, true, NULL_TREE);
1178       if (stmts)
1179         {
1180           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1181           gsi_insert_seq_on_edge_immediate (e, stmts);
1182         }
1183     }
1184   return cached;
1185 }
1186
1187 /* Return true if we can use CMP_TYPE as the comparison type to produce
1188    all masks required to mask LOOP_VINFO.  */
1189
1190 static bool
1191 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1192 {
1193   rgroup_controls *rgm;
1194   unsigned int i;
1195   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1196     if (rgm->type != NULL_TREE
1197         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1198                                             cmp_type, rgm->type,
1199                                             OPTIMIZE_FOR_SPEED))
1200       return false;
1201   return true;
1202 }
1203
1204 /* Calculate the maximum number of scalars per iteration for every
1205    rgroup in LOOP_VINFO.  */
1206
1207 static unsigned int
1208 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1209 {
1210   unsigned int res = 1;
1211   unsigned int i;
1212   rgroup_controls *rgm;
1213   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1214     res = MAX (res, rgm->max_nscalars_per_iter);
1215   return res;
1216 }
1217
1218 /* Calculate the minimum precision necessary to represent:
1219
1220       MAX_NITERS * FACTOR
1221
1222    as an unsigned integer, where MAX_NITERS is the maximum number of
1223    loop header iterations for the original scalar form of LOOP_VINFO.  */
1224
1225 static unsigned
1226 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1227 {
1228   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1229
1230   /* Get the maximum number of iterations that is representable
1231      in the counter type.  */
1232   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1233   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1234
1235   /* Get a more refined estimate for the number of iterations.  */
1236   widest_int max_back_edges;
1237   if (max_loop_iterations (loop, &max_back_edges))
1238     max_ni = wi::smin (max_ni, max_back_edges + 1);
1239
1240   /* Work out how many bits we need to represent the limit.  */
1241   return wi::min_precision (max_ni * factor, UNSIGNED);
1242 }
1243
1244 /* True if the loop needs peeling or partial vectors when vectorized.  */
1245
1246 static bool
1247 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1248 {
1249   unsigned HOST_WIDE_INT const_vf;
1250   HOST_WIDE_INT max_niter
1251     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1252
1253   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1254   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1255     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1256                                           (loop_vinfo));
1257
1258   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1259       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1260     {
1261       /* Work out the (constant) number of iterations that need to be
1262          peeled for reasons other than niters.  */
1263       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1264       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1265         peel_niter += 1;
1266       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1267                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1268         return true;
1269     }
1270   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1271       /* ??? When peeling for gaps but not alignment, we could
1272          try to check whether the (variable) niters is known to be
1273          VF * N + 1.  That's something of a niche case though.  */
1274       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1275       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1276       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1277            < (unsigned) exact_log2 (const_vf))
1278           /* In case of versioning, check if the maximum number of
1279              iterations is greater than th.  If they are identical,
1280              the epilogue is unnecessary.  */
1281           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1282               || ((unsigned HOST_WIDE_INT) max_niter
1283                   /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1284                      but that's only computed later based on our result.
1285                      The following is the most conservative approximation.  */
1286                   > (std::max ((unsigned HOST_WIDE_INT) th,
1287                                const_vf) / const_vf) * const_vf))))
1288     return true;
1289
1290   return false;
1291 }
1292
1293 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1294    whether we can actually generate the masks required.  Return true if so,
1295    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1296
1297 static bool
1298 vect_verify_full_masking (loop_vec_info loop_vinfo)
1299 {
1300   unsigned int min_ni_width;
1301
1302   /* Use a normal loop if there are no statements that need masking.
1303      This only happens in rare degenerate cases: it means that the loop
1304      has no loads, no stores, and no live-out values.  */
1305   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1306     return false;
1307
1308   /* Produce the rgroup controls.  */
1309   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1310     {
1311       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1312       tree vectype = mask.first;
1313       unsigned nvectors = mask.second;
1314
1315       if (masks->rgc_vec.length () < nvectors)
1316         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1317       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1318       /* The number of scalars per iteration and the number of vectors are
1319          both compile-time constants.  */
1320       unsigned int nscalars_per_iter
1321           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1322                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1323
1324       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1325         {
1326           rgm->max_nscalars_per_iter = nscalars_per_iter;
1327           rgm->type = truth_type_for (vectype);
1328           rgm->factor = 1;
1329         }
1330     }
1331
1332   unsigned int max_nscalars_per_iter
1333     = vect_get_max_nscalars_per_iter (loop_vinfo);
1334
1335   /* Work out how many bits we need to represent the limit.  */
1336   min_ni_width
1337     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1338
1339   /* Find a scalar mode for which WHILE_ULT is supported.  */
1340   opt_scalar_int_mode cmp_mode_iter;
1341   tree cmp_type = NULL_TREE;
1342   tree iv_type = NULL_TREE;
1343   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1344   unsigned int iv_precision = UINT_MAX;
1345
1346   if (iv_limit != -1)
1347     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1348                                       UNSIGNED);
1349
1350   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1351     {
1352       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1353       if (cmp_bits >= min_ni_width
1354           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1355         {
1356           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1357           if (this_type
1358               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1359             {
1360               /* Although we could stop as soon as we find a valid mode,
1361                  there are at least two reasons why that's not always the
1362                  best choice:
1363
1364                  - An IV that's Pmode or wider is more likely to be reusable
1365                    in address calculations than an IV that's narrower than
1366                    Pmode.
1367
1368                  - Doing the comparison in IV_PRECISION or wider allows
1369                    a natural 0-based IV, whereas using a narrower comparison
1370                    type requires mitigations against wrap-around.
1371
1372                  Conversely, if the IV limit is variable, doing the comparison
1373                  in a wider type than the original type can introduce
1374                  unnecessary extensions, so picking the widest valid mode
1375                  is not always a good choice either.
1376
1377                  Here we prefer the first IV type that's Pmode or wider,
1378                  and the first comparison type that's IV_PRECISION or wider.
1379                  (The comparison type must be no wider than the IV type,
1380                  to avoid extensions in the vector loop.)
1381
1382                  ??? We might want to try continuing beyond Pmode for ILP32
1383                  targets if CMP_BITS < IV_PRECISION.  */
1384               iv_type = this_type;
1385               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1386                 cmp_type = this_type;
1387               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1388                 break;
1389             }
1390         }
1391     }
1392
1393   if (!cmp_type)
1394     {
1395       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1396       return false;
1397     }
1398
1399   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1400   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1401   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1402   return true;
1403 }
1404
1405 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1406    whether we can actually generate AVX512 style masks.  Return true if so,
1407    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1408
1409 static bool
1410 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1411 {
1412   /* Produce differently organized rgc_vec and differently check
1413      we can produce masks.  */
1414
1415   /* Use a normal loop if there are no statements that need masking.
1416      This only happens in rare degenerate cases: it means that the loop
1417      has no loads, no stores, and no live-out values.  */
1418   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1419     return false;
1420
1421   /* For the decrementing IV we need to represent all values in
1422      [0, niter + niter_skip] where niter_skip is the elements we
1423      skip in the first iteration for prologue peeling.  */
1424   tree iv_type = NULL_TREE;
1425   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1426   unsigned int iv_precision = UINT_MAX;
1427   if (iv_limit != -1)
1428     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1429
1430   /* First compute the type for the IV we use to track the remaining
1431      scalar iterations.  */
1432   opt_scalar_int_mode cmp_mode_iter;
1433   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1434     {
1435       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1436       if (cmp_bits >= iv_precision
1437           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1438         {
1439           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1440           if (iv_type)
1441             break;
1442         }
1443     }
1444   if (!iv_type)
1445     return false;
1446
1447   /* Produce the rgroup controls.  */
1448   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1449     {
1450       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1451       tree vectype = mask.first;
1452       unsigned nvectors = mask.second;
1453
1454       /* The number of scalars per iteration and the number of vectors are
1455          both compile-time constants.  */
1456       unsigned int nscalars_per_iter
1457         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1458                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1459
1460       /* We index the rgroup_controls vector with nscalars_per_iter
1461          which we keep constant and instead have a varying nvectors,
1462          remembering the vector mask with the fewest nV.  */
1463       if (masks->rgc_vec.length () < nscalars_per_iter)
1464         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1465       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1466
1467       if (!rgm->type || rgm->factor > nvectors)
1468         {
1469           rgm->type = truth_type_for (vectype);
1470           rgm->compare_type = NULL_TREE;
1471           rgm->max_nscalars_per_iter = nscalars_per_iter;
1472           rgm->factor = nvectors;
1473           rgm->bias_adjusted_ctrl = NULL_TREE;
1474         }
1475     }
1476
1477   /* There is no fixed compare type we are going to use but we have to
1478      be able to get at one for each mask group.  */
1479   unsigned int min_ni_width
1480     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1481
1482   bool ok = true;
1483   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1484     {
1485       tree mask_type = rgc.type;
1486       if (!mask_type)
1487         continue;
1488
1489       /* For now vect_get_loop_mask only supports integer mode masks
1490          when we need to split it.  */
1491       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1492           || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1493         {
1494           ok = false;
1495           break;
1496         }
1497
1498       /* If iv_type is usable as compare type use that - we can elide the
1499          saturation in that case.   */
1500       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1501         {
1502           tree cmp_vectype
1503             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1504           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1505             rgc.compare_type = cmp_vectype;
1506         }
1507       if (!rgc.compare_type)
1508         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1509           {
1510             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1511             if (cmp_bits >= min_ni_width
1512                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1513               {
1514                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1515                 if (!cmp_type)
1516                   continue;
1517
1518                 /* Check whether we can produce the mask with cmp_type.  */
1519                 tree cmp_vectype
1520                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1521                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1522                   {
1523                     rgc.compare_type = cmp_vectype;
1524                     break;
1525                   }
1526               }
1527         }
1528       if (!rgc.compare_type)
1529         {
1530           ok = false;
1531           break;
1532         }
1533     }
1534   if (!ok)
1535     {
1536       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1537       return false;
1538     }
1539
1540   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1541   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1542   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1543   return true;
1544 }
1545
1546 /* Check whether we can use vector access with length based on precison
1547    comparison.  So far, to keep it simple, we only allow the case that the
1548    precision of the target supported length is larger than the precision
1549    required by loop niters.  */
1550
1551 static bool
1552 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1553 {
1554   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1555     return false;
1556
1557   machine_mode len_load_mode, len_store_mode;
1558   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1559          .exists (&len_load_mode))
1560     return false;
1561   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1562          .exists (&len_store_mode))
1563     return false;
1564
1565   signed char partial_load_bias = internal_len_load_store_bias
1566     (IFN_LEN_LOAD, len_load_mode);
1567
1568   signed char partial_store_bias = internal_len_load_store_bias
1569     (IFN_LEN_STORE, len_store_mode);
1570
1571   gcc_assert (partial_load_bias == partial_store_bias);
1572
1573   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1574     return false;
1575
1576   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1577      len_loads with a length of zero.  In order to avoid that we prohibit
1578      more than one loop length here.  */
1579   if (partial_load_bias == -1
1580       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1581     return false;
1582
1583   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1584
1585   unsigned int max_nitems_per_iter = 1;
1586   unsigned int i;
1587   rgroup_controls *rgl;
1588   /* Find the maximum number of items per iteration for every rgroup.  */
1589   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1590     {
1591       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1592       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1593     }
1594
1595   /* Work out how many bits we need to represent the length limit.  */
1596   unsigned int min_ni_prec
1597     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1598
1599   /* Now use the maximum of below precisions for one suitable IV type:
1600      - the IV's natural precision
1601      - the precision needed to hold: the maximum number of scalar
1602        iterations multiplied by the scale factor (min_ni_prec above)
1603      - the Pmode precision
1604
1605      If min_ni_prec is less than the precision of the current niters,
1606      we perfer to still use the niters type.  Prefer to use Pmode and
1607      wider IV to avoid narrow conversions.  */
1608
1609   unsigned int ni_prec
1610     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1611   min_ni_prec = MAX (min_ni_prec, ni_prec);
1612   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1613
1614   tree iv_type = NULL_TREE;
1615   opt_scalar_int_mode tmode_iter;
1616   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1617     {
1618       scalar_mode tmode = tmode_iter.require ();
1619       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1620
1621       /* ??? Do we really want to construct one IV whose precision exceeds
1622          BITS_PER_WORD?  */
1623       if (tbits > BITS_PER_WORD)
1624         break;
1625
1626       /* Find the first available standard integral type.  */
1627       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1628         {
1629           iv_type = build_nonstandard_integer_type (tbits, true);
1630           break;
1631         }
1632     }
1633
1634   if (!iv_type)
1635     {
1636       if (dump_enabled_p ())
1637         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1638                          "can't vectorize with length-based partial vectors"
1639                          " because there is no suitable iv type.\n");
1640       return false;
1641     }
1642
1643   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1644   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1645   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1646
1647   return true;
1648 }
1649
1650 /* Calculate the cost of one scalar iteration of the loop.  */
1651 static void
1652 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1653 {
1654   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1655   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1656   int nbbs = loop->num_nodes, factor;
1657   int innerloop_iters, i;
1658
1659   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1660
1661   /* Gather costs for statements in the scalar loop.  */
1662
1663   /* FORNOW.  */
1664   innerloop_iters = 1;
1665   if (loop->inner)
1666     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1667
1668   for (i = 0; i < nbbs; i++)
1669     {
1670       gimple_stmt_iterator si;
1671       basic_block bb = bbs[i];
1672
1673       if (bb->loop_father == loop->inner)
1674         factor = innerloop_iters;
1675       else
1676         factor = 1;
1677
1678       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1679         {
1680           gimple *stmt = gsi_stmt (si);
1681           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1682
1683           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1684             continue;
1685
1686           /* Skip stmts that are not vectorized inside the loop.  */
1687           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1688           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1689               && (!STMT_VINFO_LIVE_P (vstmt_info)
1690                   || !VECTORIZABLE_CYCLE_DEF
1691                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1692             continue;
1693
1694           vect_cost_for_stmt kind;
1695           if (STMT_VINFO_DATA_REF (stmt_info))
1696             {
1697               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1698                kind = scalar_load;
1699              else
1700                kind = scalar_store;
1701             }
1702           else if (vect_nop_conversion_p (stmt_info))
1703             continue;
1704           else
1705             kind = scalar_stmt;
1706
1707           /* We are using vect_prologue here to avoid scaling twice
1708              by the inner loop factor.  */
1709           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1710                             factor, kind, stmt_info, 0, vect_prologue);
1711         }
1712     }
1713
1714   /* Now accumulate cost.  */
1715   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1716   add_stmt_costs (loop_vinfo->scalar_costs,
1717                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1718   loop_vinfo->scalar_costs->finish_cost (nullptr);
1719 }
1720
1721 /* Function vect_analyze_loop_form.
1722
1723    Verify that certain CFG restrictions hold, including:
1724    - the loop has a pre-header
1725    - the loop has a single entry
1726    - nested loops can have only a single exit.
1727    - the loop exit condition is simple enough
1728    - the number of iterations can be analyzed, i.e, a countable loop.  The
1729      niter could be analyzed under some assumptions.  */
1730
1731 opt_result
1732 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1733 {
1734   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1735
1736   edge exit_e = vec_init_loop_exit_info (loop);
1737   if (!exit_e)
1738     return opt_result::failure_at (vect_location,
1739                                    "not vectorized:"
1740                                    " could not determine main exit from"
1741                                    " loop with multiple exits.\n");
1742   info->loop_exit = exit_e;
1743   if (dump_enabled_p ())
1744       dump_printf_loc (MSG_NOTE, vect_location,
1745                        "using as main loop exit: %d -> %d [AUX: %p]\n",
1746                        exit_e->src->index, exit_e->dest->index, exit_e->aux);
1747
1748   /* Check if we have any control flow that doesn't leave the loop.  */
1749   class loop *v_loop = loop->inner ? loop->inner : loop;
1750   basic_block *bbs = get_loop_body (v_loop);
1751   for (unsigned i = 0; i < v_loop->num_nodes; i++)
1752     if (EDGE_COUNT (bbs[i]->succs) != 1
1753         && (EDGE_COUNT (bbs[i]->succs) != 2
1754             || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1755       {
1756         free (bbs);
1757         return opt_result::failure_at (vect_location,
1758                                        "not vectorized:"
1759                                        " unsupported control flow in loop.\n");
1760       }
1761   free (bbs);
1762
1763   /* Different restrictions apply when we are considering an inner-most loop,
1764      vs. an outer (nested) loop.
1765      (FORNOW. May want to relax some of these restrictions in the future).  */
1766
1767   info->inner_loop_cond = NULL;
1768   if (!loop->inner)
1769     {
1770       /* Inner-most loop.  */
1771
1772       if (empty_block_p (loop->header))
1773         return opt_result::failure_at (vect_location,
1774                                        "not vectorized: empty loop.\n");
1775     }
1776   else
1777     {
1778       class loop *innerloop = loop->inner;
1779       edge entryedge;
1780
1781       /* Nested loop. We currently require that the loop is doubly-nested,
1782          contains a single inner loop with a single exit to the block
1783          with the single exit condition in the outer loop.
1784          Vectorizable outer-loops look like this:
1785
1786                         (pre-header)
1787                            |
1788                           header <---+
1789                            |         |
1790                           inner-loop |
1791                            |         |
1792                           tail ------+
1793                            |
1794                         (exit-bb)
1795
1796          The inner-loop also has the properties expected of inner-most loops
1797          as described above.  */
1798
1799       if ((loop->inner)->inner || (loop->inner)->next)
1800         return opt_result::failure_at (vect_location,
1801                                        "not vectorized:"
1802                                        " multiple nested loops.\n");
1803
1804       entryedge = loop_preheader_edge (innerloop);
1805       if (entryedge->src != loop->header
1806           || !single_exit (innerloop)
1807           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1808         return opt_result::failure_at (vect_location,
1809                                        "not vectorized:"
1810                                        " unsupported outerloop form.\n");
1811
1812       /* Analyze the inner-loop.  */
1813       vect_loop_form_info inner;
1814       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1815       if (!res)
1816         {
1817           if (dump_enabled_p ())
1818             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1819                              "not vectorized: Bad inner loop.\n");
1820           return res;
1821         }
1822
1823       /* Don't support analyzing niter under assumptions for inner
1824          loop.  */
1825       if (!integer_onep (inner.assumptions))
1826         return opt_result::failure_at (vect_location,
1827                                        "not vectorized: Bad inner loop.\n");
1828
1829       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1830         return opt_result::failure_at (vect_location,
1831                                        "not vectorized: inner-loop count not"
1832                                        " invariant.\n");
1833
1834       if (dump_enabled_p ())
1835         dump_printf_loc (MSG_NOTE, vect_location,
1836                          "Considering outer-loop vectorization.\n");
1837       info->inner_loop_cond = inner.conds[0];
1838     }
1839
1840   if (EDGE_COUNT (loop->header->preds) != 2)
1841     return opt_result::failure_at (vect_location,
1842                                    "not vectorized:"
1843                                    " too many incoming edges.\n");
1844
1845   /* We assume that the latch is empty.  */
1846   if (!empty_block_p (loop->latch)
1847       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1848     return opt_result::failure_at (vect_location,
1849                                    "not vectorized: latch block not empty.\n");
1850
1851   /* Make sure there is no abnormal exit.  */
1852   auto_vec<edge> exits = get_loop_exit_edges (loop);
1853   for (edge e : exits)
1854     {
1855       if (e->flags & EDGE_ABNORMAL)
1856         return opt_result::failure_at (vect_location,
1857                                        "not vectorized:"
1858                                        " abnormal loop exit edge.\n");
1859     }
1860
1861   info->conds
1862     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1863                             &info->number_of_iterations,
1864                             &info->number_of_iterationsm1);
1865   if (info->conds.is_empty ())
1866     return opt_result::failure_at
1867       (vect_location,
1868        "not vectorized: complicated exit condition.\n");
1869
1870   /* Determine what the primary and alternate exit conds are.  */
1871   for (unsigned i = 0; i < info->conds.length (); i++)
1872     {
1873       gcond *cond = info->conds[i];
1874       if (exit_e->src == gimple_bb (cond))
1875         std::swap (info->conds[0], info->conds[i]);
1876     }
1877
1878   if (integer_zerop (info->assumptions)
1879       || !info->number_of_iterations
1880       || chrec_contains_undetermined (info->number_of_iterations))
1881     return opt_result::failure_at
1882       (info->conds[0],
1883        "not vectorized: number of iterations cannot be computed.\n");
1884
1885   if (integer_zerop (info->number_of_iterations))
1886     return opt_result::failure_at
1887       (info->conds[0],
1888        "not vectorized: number of iterations = 0.\n");
1889
1890   if (!(tree_fits_shwi_p (info->number_of_iterations)
1891         && tree_to_shwi (info->number_of_iterations) > 0))
1892     {
1893       if (dump_enabled_p ())
1894         {
1895           dump_printf_loc (MSG_NOTE, vect_location,
1896                            "Symbolic number of iterations is ");
1897           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1898           dump_printf (MSG_NOTE, "\n");
1899         }
1900     }
1901
1902   return opt_result::success ();
1903 }
1904
1905 /* Create a loop_vec_info for LOOP with SHARED and the
1906    vect_analyze_loop_form result.  */
1907
1908 loop_vec_info
1909 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1910                         const vect_loop_form_info *info,
1911                         loop_vec_info main_loop_info)
1912 {
1913   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1914   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1915   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1916   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1917   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1918   /* Also record the assumptions for versioning.  */
1919   if (!integer_onep (info->assumptions) && !main_loop_info)
1920     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1921
1922   for (gcond *cond : info->conds)
1923     {
1924       stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1925       STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1926       /* Mark the statement as a condition.  */
1927       STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1928     }
1929
1930   for (unsigned i = 1; i < info->conds.length (); i ++)
1931     LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1932   LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1933
1934   LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1935
1936   /* Check to see if we're vectorizing multiple exits.  */
1937   LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1938     = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1939
1940   if (info->inner_loop_cond)
1941     {
1942       stmt_vec_info inner_loop_cond_info
1943         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1944       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1945       /* If we have an estimate on the number of iterations of the inner
1946          loop use that to limit the scale for costing, otherwise use
1947          --param vect-inner-loop-cost-factor literally.  */
1948       widest_int nit;
1949       if (estimated_stmt_executions (loop->inner, &nit))
1950         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1951           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1952     }
1953
1954   return loop_vinfo;
1955 }
1956
1957
1958
1959 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1960    statements update the vectorization factor.  */
1961
1962 static void
1963 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1964 {
1965   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1966   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1967   int nbbs = loop->num_nodes;
1968   poly_uint64 vectorization_factor;
1969   int i;
1970
1971   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1972
1973   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1974   gcc_assert (known_ne (vectorization_factor, 0U));
1975
1976   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1977      vectorization factor of the loop is the unrolling factor required by
1978      the SLP instances.  If that unrolling factor is 1, we say, that we
1979      perform pure SLP on loop - cross iteration parallelism is not
1980      exploited.  */
1981   bool only_slp_in_loop = true;
1982   for (i = 0; i < nbbs; i++)
1983     {
1984       basic_block bb = bbs[i];
1985       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1986            gsi_next (&si))
1987         {
1988           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1989           if (!stmt_info)
1990             continue;
1991           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1992                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1993               && !PURE_SLP_STMT (stmt_info))
1994             /* STMT needs both SLP and loop-based vectorization.  */
1995             only_slp_in_loop = false;
1996         }
1997       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1998            gsi_next (&si))
1999         {
2000           if (is_gimple_debug (gsi_stmt (si)))
2001             continue;
2002           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2003           stmt_info = vect_stmt_to_vectorize (stmt_info);
2004           if ((STMT_VINFO_RELEVANT_P (stmt_info)
2005                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2006               && !PURE_SLP_STMT (stmt_info))
2007             /* STMT needs both SLP and loop-based vectorization.  */
2008             only_slp_in_loop = false;
2009         }
2010     }
2011
2012   if (only_slp_in_loop)
2013     {
2014       if (dump_enabled_p ())
2015         dump_printf_loc (MSG_NOTE, vect_location,
2016                          "Loop contains only SLP stmts\n");
2017       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2018     }
2019   else
2020     {
2021       if (dump_enabled_p ())
2022         dump_printf_loc (MSG_NOTE, vect_location,
2023                          "Loop contains SLP and non-SLP stmts\n");
2024       /* Both the vectorization factor and unroll factor have the form
2025          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2026          so they must have a common multiple.  */
2027       vectorization_factor
2028         = force_common_multiple (vectorization_factor,
2029                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2030     }
2031
2032   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2033   if (dump_enabled_p ())
2034     {
2035       dump_printf_loc (MSG_NOTE, vect_location,
2036                        "Updating vectorization factor to ");
2037       dump_dec (MSG_NOTE, vectorization_factor);
2038       dump_printf (MSG_NOTE, ".\n");
2039     }
2040 }
2041
2042 /* Return true if STMT_INFO describes a double reduction phi and if
2043    the other phi in the reduction is also relevant for vectorization.
2044    This rejects cases such as:
2045
2046       outer1:
2047         x_1 = PHI <x_3(outer2), ...>;
2048         ...
2049
2050       inner:
2051         x_2 = ...;
2052         ...
2053
2054       outer2:
2055         x_3 = PHI <x_2(inner)>;
2056
2057    if nothing in x_2 or elsewhere makes x_1 relevant.  */
2058
2059 static bool
2060 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2061 {
2062   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2063     return false;
2064
2065   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2066 }
2067
2068 /* Function vect_analyze_loop_operations.
2069
2070    Scan the loop stmts and make sure they are all vectorizable.  */
2071
2072 static opt_result
2073 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2074 {
2075   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2076   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2077   int nbbs = loop->num_nodes;
2078   int i;
2079   stmt_vec_info stmt_info;
2080   bool need_to_vectorize = false;
2081   bool ok;
2082
2083   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2084
2085   auto_vec<stmt_info_for_cost> cost_vec;
2086
2087   for (i = 0; i < nbbs; i++)
2088     {
2089       basic_block bb = bbs[i];
2090
2091       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2092            gsi_next (&si))
2093         {
2094           gphi *phi = si.phi ();
2095           ok = true;
2096
2097           stmt_info = loop_vinfo->lookup_stmt (phi);
2098           if (dump_enabled_p ())
2099             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2100                              (gimple *) phi);
2101           if (virtual_operand_p (gimple_phi_result (phi)))
2102             continue;
2103
2104           /* Inner-loop loop-closed exit phi in outer-loop vectorization
2105              (i.e., a phi in the tail of the outer-loop).  */
2106           if (! is_loop_header_bb_p (bb))
2107             {
2108               /* FORNOW: we currently don't support the case that these phis
2109                  are not used in the outerloop (unless it is double reduction,
2110                  i.e., this phi is vect_reduction_def), cause this case
2111                  requires to actually do something here.  */
2112               if (STMT_VINFO_LIVE_P (stmt_info)
2113                   && !vect_active_double_reduction_p (stmt_info))
2114                 return opt_result::failure_at (phi,
2115                                                "Unsupported loop-closed phi"
2116                                                " in outer-loop.\n");
2117
2118               /* If PHI is used in the outer loop, we check that its operand
2119                  is defined in the inner loop.  */
2120               if (STMT_VINFO_RELEVANT_P (stmt_info))
2121                 {
2122                   tree phi_op;
2123
2124                   if (gimple_phi_num_args (phi) != 1)
2125                     return opt_result::failure_at (phi, "unsupported phi");
2126
2127                   phi_op = PHI_ARG_DEF (phi, 0);
2128                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2129                   if (!op_def_info)
2130                     return opt_result::failure_at (phi, "unsupported phi\n");
2131
2132                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2133                       && (STMT_VINFO_RELEVANT (op_def_info)
2134                           != vect_used_in_outer_by_reduction))
2135                     return opt_result::failure_at (phi, "unsupported phi\n");
2136
2137                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2138                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2139                            == vect_double_reduction_def))
2140                       && !vectorizable_lc_phi (loop_vinfo,
2141                                                stmt_info, NULL, NULL))
2142                     return opt_result::failure_at (phi, "unsupported phi\n");
2143                 }
2144
2145               continue;
2146             }
2147
2148           gcc_assert (stmt_info);
2149
2150           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2151                || STMT_VINFO_LIVE_P (stmt_info))
2152               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2153               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2154             /* A scalar-dependence cycle that we don't support.  */
2155             return opt_result::failure_at (phi,
2156                                            "not vectorized:"
2157                                            " scalar dependence cycle.\n");
2158
2159           if (STMT_VINFO_RELEVANT_P (stmt_info))
2160             {
2161               need_to_vectorize = true;
2162               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2163                   && ! PURE_SLP_STMT (stmt_info))
2164                 ok = vectorizable_induction (loop_vinfo,
2165                                              stmt_info, NULL, NULL,
2166                                              &cost_vec);
2167               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2168                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2169                             == vect_double_reduction_def)
2170                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2171                        && ! PURE_SLP_STMT (stmt_info))
2172                 ok = vectorizable_reduction (loop_vinfo,
2173                                              stmt_info, NULL, NULL, &cost_vec);
2174               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2175                         == vect_first_order_recurrence)
2176                        && ! PURE_SLP_STMT (stmt_info))
2177                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2178                                            &cost_vec);
2179             }
2180
2181           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2182           if (ok
2183               && STMT_VINFO_LIVE_P (stmt_info)
2184               && !PURE_SLP_STMT (stmt_info))
2185             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2186                                               -1, false, &cost_vec);
2187
2188           if (!ok)
2189             return opt_result::failure_at (phi,
2190                                            "not vectorized: relevant phi not "
2191                                            "supported: %G",
2192                                            static_cast <gimple *> (phi));
2193         }
2194
2195       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2196            gsi_next (&si))
2197         {
2198           gimple *stmt = gsi_stmt (si);
2199           if (!gimple_clobber_p (stmt)
2200               && !is_gimple_debug (stmt))
2201             {
2202               opt_result res
2203                 = vect_analyze_stmt (loop_vinfo,
2204                                      loop_vinfo->lookup_stmt (stmt),
2205                                      &need_to_vectorize,
2206                                      NULL, NULL, &cost_vec);
2207               if (!res)
2208                 return res;
2209             }
2210         }
2211     } /* bbs */
2212
2213   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2214
2215   /* All operations in the loop are either irrelevant (deal with loop
2216      control, or dead), or only used outside the loop and can be moved
2217      out of the loop (e.g. invariants, inductions).  The loop can be
2218      optimized away by scalar optimizations.  We're better off not
2219      touching this loop.  */
2220   if (!need_to_vectorize)
2221     {
2222       if (dump_enabled_p ())
2223         dump_printf_loc (MSG_NOTE, vect_location,
2224                          "All the computation can be taken out of the loop.\n");
2225       return opt_result::failure_at
2226         (vect_location,
2227          "not vectorized: redundant loop. no profit to vectorize.\n");
2228     }
2229
2230   return opt_result::success ();
2231 }
2232
2233 /* Return true if we know that the iteration count is smaller than the
2234    vectorization factor.  Return false if it isn't, or if we can't be sure
2235    either way.  */
2236
2237 static bool
2238 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2239 {
2240   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2241
2242   HOST_WIDE_INT max_niter;
2243   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2244     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2245   else
2246     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2247
2248   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2249     return true;
2250
2251   return false;
2252 }
2253
2254 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2255    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2256    definitely no, or -1 if it's worth retrying.  */
2257
2258 static int
2259 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2260                            unsigned *suggested_unroll_factor)
2261 {
2262   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2263   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2264
2265   /* Only loops that can handle partially-populated vectors can have iteration
2266      counts less than the vectorization factor.  */
2267   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2268       && vect_known_niters_smaller_than_vf (loop_vinfo))
2269     {
2270       if (dump_enabled_p ())
2271         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272                          "not vectorized: iteration count smaller than "
2273                          "vectorization factor.\n");
2274       return 0;
2275     }
2276
2277   /* If we know the number of iterations we can do better, for the
2278      epilogue we can also decide whether the main loop leaves us
2279      with enough iterations, prefering a smaller vector epilog then
2280      also possibly used for the case we skip the vector loop.  */
2281   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2282     {
2283       widest_int scalar_niters
2284         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2285       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2286         {
2287           loop_vec_info orig_loop_vinfo
2288             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2289           unsigned lowest_vf
2290             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2291           int prolog_peeling = 0;
2292           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2293             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2294           if (prolog_peeling >= 0
2295               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2296                            lowest_vf))
2297             {
2298               unsigned gap
2299                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2300               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2301                                % lowest_vf + gap);
2302             }
2303         }
2304       /* Reject vectorizing for a single scalar iteration, even if
2305          we could in principle implement that using partial vectors.  */
2306       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2307       if (scalar_niters <= peeling_gap + 1)
2308         {
2309           if (dump_enabled_p ())
2310             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311                              "not vectorized: loop only has a single "
2312                              "scalar iteration.\n");
2313           return 0;
2314         }
2315
2316       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2317         {
2318           /* Check that the loop processes at least one full vector.  */
2319           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2320           if (known_lt (scalar_niters, vf))
2321             {
2322               if (dump_enabled_p ())
2323                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2324                                  "loop does not have enough iterations "
2325                                  "to support vectorization.\n");
2326               return 0;
2327             }
2328
2329           /* If we need to peel an extra epilogue iteration to handle data
2330              accesses with gaps, check that there are enough scalar iterations
2331              available.
2332
2333              The check above is redundant with this one when peeling for gaps,
2334              but the distinction is useful for diagnostics.  */
2335           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2336               && known_le (scalar_niters, vf))
2337             {
2338               if (dump_enabled_p ())
2339                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2340                                  "loop does not have enough iterations "
2341                                  "to support peeling for gaps.\n");
2342               return 0;
2343             }
2344         }
2345     }
2346
2347   /* If using the "very cheap" model. reject cases in which we'd keep
2348      a copy of the scalar code (even if we might be able to vectorize it).  */
2349   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2350       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2351           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2352           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2353     {
2354       if (dump_enabled_p ())
2355         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2356                          "some scalar iterations would need to be peeled\n");
2357       return 0;
2358     }
2359
2360   int min_profitable_iters, min_profitable_estimate;
2361   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2362                                       &min_profitable_estimate,
2363                                       suggested_unroll_factor);
2364
2365   if (min_profitable_iters < 0)
2366     {
2367       if (dump_enabled_p ())
2368         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369                          "not vectorized: vectorization not profitable.\n");
2370       if (dump_enabled_p ())
2371         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2372                          "not vectorized: vector version will never be "
2373                          "profitable.\n");
2374       return -1;
2375     }
2376
2377   int min_scalar_loop_bound = (param_min_vect_loop_bound
2378                                * assumed_vf);
2379
2380   /* Use the cost model only if it is more conservative than user specified
2381      threshold.  */
2382   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2383                                     min_profitable_iters);
2384
2385   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2386
2387   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2388       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2389     {
2390       if (dump_enabled_p ())
2391         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2392                          "not vectorized: vectorization not profitable.\n");
2393       if (dump_enabled_p ())
2394         dump_printf_loc (MSG_NOTE, vect_location,
2395                          "not vectorized: iteration count smaller than user "
2396                          "specified loop bound parameter or minimum profitable "
2397                          "iterations (whichever is more conservative).\n");
2398       return 0;
2399     }
2400
2401   /* The static profitablity threshold min_profitable_estimate includes
2402      the cost of having to check at runtime whether the scalar loop
2403      should be used instead.  If it turns out that we don't need or want
2404      such a check, the threshold we should use for the static estimate
2405      is simply the point at which the vector loop becomes more profitable
2406      than the scalar loop.  */
2407   if (min_profitable_estimate > min_profitable_iters
2408       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2409       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2410       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2411       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2412     {
2413       if (dump_enabled_p ())
2414         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2415                          " choice between the scalar and vector loops\n");
2416       min_profitable_estimate = min_profitable_iters;
2417     }
2418
2419   /* If the vector loop needs multiple iterations to be beneficial then
2420      things are probably too close to call, and the conservative thing
2421      would be to stick with the scalar code.  */
2422   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2423       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2424     {
2425       if (dump_enabled_p ())
2426         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427                          "one iteration of the vector loop would be"
2428                          " more expensive than the equivalent number of"
2429                          " iterations of the scalar loop\n");
2430       return 0;
2431     }
2432
2433   HOST_WIDE_INT estimated_niter;
2434
2435   /* If we are vectorizing an epilogue then we know the maximum number of
2436      scalar iterations it will cover is at least one lower than the
2437      vectorization factor of the main loop.  */
2438   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2439     estimated_niter
2440       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2441   else
2442     {
2443       estimated_niter = estimated_stmt_executions_int (loop);
2444       if (estimated_niter == -1)
2445         estimated_niter = likely_max_stmt_executions_int (loop);
2446     }
2447   if (estimated_niter != -1
2448       && ((unsigned HOST_WIDE_INT) estimated_niter
2449           < MAX (th, (unsigned) min_profitable_estimate)))
2450     {
2451       if (dump_enabled_p ())
2452         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453                          "not vectorized: estimated iteration count too "
2454                          "small.\n");
2455       if (dump_enabled_p ())
2456         dump_printf_loc (MSG_NOTE, vect_location,
2457                          "not vectorized: estimated iteration count smaller "
2458                          "than specified loop bound parameter or minimum "
2459                          "profitable iterations (whichever is more "
2460                          "conservative).\n");
2461       return -1;
2462     }
2463
2464   return 1;
2465 }
2466
2467 static opt_result
2468 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2469                            vec<data_reference_p> *datarefs,
2470                            unsigned int *n_stmts)
2471 {
2472   *n_stmts = 0;
2473   for (unsigned i = 0; i < loop->num_nodes; i++)
2474     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2475          !gsi_end_p (gsi); gsi_next (&gsi))
2476       {
2477         gimple *stmt = gsi_stmt (gsi);
2478         if (is_gimple_debug (stmt))
2479           continue;
2480         ++(*n_stmts);
2481         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2482                                                         NULL, 0);
2483         if (!res)
2484           {
2485             if (is_gimple_call (stmt) && loop->safelen)
2486               {
2487                 tree fndecl = gimple_call_fndecl (stmt), op;
2488                 if (fndecl == NULL_TREE
2489                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2490                   {
2491                     fndecl = gimple_call_arg (stmt, 0);
2492                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2493                     fndecl = TREE_OPERAND (fndecl, 0);
2494                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2495                   }
2496                 if (fndecl != NULL_TREE)
2497                   {
2498                     cgraph_node *node = cgraph_node::get (fndecl);
2499                     if (node != NULL && node->simd_clones != NULL)
2500                       {
2501                         unsigned int j, n = gimple_call_num_args (stmt);
2502                         for (j = 0; j < n; j++)
2503                           {
2504                             op = gimple_call_arg (stmt, j);
2505                             if (DECL_P (op)
2506                                 || (REFERENCE_CLASS_P (op)
2507                                     && get_base_address (op)))
2508                               break;
2509                           }
2510                         op = gimple_call_lhs (stmt);
2511                         /* Ignore #pragma omp declare simd functions
2512                            if they don't have data references in the
2513                            call stmt itself.  */
2514                         if (j == n
2515                             && !(op
2516                                  && (DECL_P (op)
2517                                      || (REFERENCE_CLASS_P (op)
2518                                          && get_base_address (op)))))
2519                           continue;
2520                       }
2521                   }
2522               }
2523             return res;
2524           }
2525         /* If dependence analysis will give up due to the limit on the
2526            number of datarefs stop here and fail fatally.  */
2527         if (datarefs->length ()
2528             > (unsigned)param_loop_max_datarefs_for_datadeps)
2529           return opt_result::failure_at (stmt, "exceeded param "
2530                                          "loop-max-datarefs-for-datadeps\n");
2531       }
2532   return opt_result::success ();
2533 }
2534
2535 /* Look for SLP-only access groups and turn each individual access into its own
2536    group.  */
2537 static void
2538 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2539 {
2540   unsigned int i;
2541   struct data_reference *dr;
2542
2543   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2544
2545   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2546   FOR_EACH_VEC_ELT (datarefs, i, dr)
2547     {
2548       gcc_assert (DR_REF (dr));
2549       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2550
2551       /* Check if the load is a part of an interleaving chain.  */
2552       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2553         {
2554           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2555           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2556           unsigned int group_size = DR_GROUP_SIZE (first_element);
2557
2558           /* Check if SLP-only groups.  */
2559           if (!STMT_SLP_TYPE (stmt_info)
2560               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2561             {
2562               /* Dissolve the group.  */
2563               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2564
2565               stmt_vec_info vinfo = first_element;
2566               while (vinfo)
2567                 {
2568                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2569                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2570                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2571                   DR_GROUP_SIZE (vinfo) = 1;
2572                   if (STMT_VINFO_STRIDED_P (first_element)
2573                       /* We cannot handle stores with gaps.  */
2574                       || DR_IS_WRITE (dr_info->dr))
2575                     {
2576                       STMT_VINFO_STRIDED_P (vinfo) = true;
2577                       DR_GROUP_GAP (vinfo) = 0;
2578                     }
2579                   else
2580                     DR_GROUP_GAP (vinfo) = group_size - 1;
2581                   /* Duplicate and adjust alignment info, it needs to
2582                      be present on each group leader, see dr_misalignment.  */
2583                   if (vinfo != first_element)
2584                     {
2585                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2586                       dr_info2->target_alignment = dr_info->target_alignment;
2587                       int misalignment = dr_info->misalignment;
2588                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2589                         {
2590                           HOST_WIDE_INT diff
2591                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2592                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2593                           unsigned HOST_WIDE_INT align_c
2594                             = dr_info->target_alignment.to_constant ();
2595                           misalignment = (misalignment + diff) % align_c;
2596                         }
2597                       dr_info2->misalignment = misalignment;
2598                     }
2599                   vinfo = next;
2600                 }
2601             }
2602         }
2603     }
2604 }
2605
2606 /* Determine if operating on full vectors for LOOP_VINFO might leave
2607    some scalar iterations still to do.  If so, decide how we should
2608    handle those scalar iterations.  The possibilities are:
2609
2610    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2611        In this case:
2612
2613          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2614          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2615          LOOP_VINFO_PEELING_FOR_NITER == false
2616
2617    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2618        to handle the remaining scalar iterations.  In this case:
2619
2620          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2621          LOOP_VINFO_PEELING_FOR_NITER == true
2622
2623        There are two choices:
2624
2625        (2a) Consider vectorizing the epilogue loop at the same VF as the
2626             main loop, but using partial vectors instead of full vectors.
2627             In this case:
2628
2629               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2630
2631        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2632             In this case:
2633
2634               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2635  */
2636
2637 opt_result
2638 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2639 {
2640   /* Determine whether there would be any scalar iterations left over.  */
2641   bool need_peeling_or_partial_vectors_p
2642     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2643
2644   /* Decide whether to vectorize the loop with partial vectors.  */
2645   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2646   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2647   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2648       && need_peeling_or_partial_vectors_p)
2649     {
2650       /* For partial-vector-usage=1, try to push the handling of partial
2651          vectors to the epilogue, with the main loop continuing to operate
2652          on full vectors.
2653
2654          If we are unrolling we also do not want to use partial vectors. This
2655          is to avoid the overhead of generating multiple masks and also to
2656          avoid having to execute entire iterations of FALSE masked instructions
2657          when dealing with one or less full iterations.
2658
2659          ??? We could then end up failing to use partial vectors if we
2660          decide to peel iterations into a prologue, and if the main loop
2661          then ends up processing fewer than VF iterations.  */
2662       if ((param_vect_partial_vector_usage == 1
2663            || loop_vinfo->suggested_unroll_factor > 1)
2664           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2665           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2666         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2667       else
2668         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2669     }
2670
2671   if (dump_enabled_p ())
2672     dump_printf_loc (MSG_NOTE, vect_location,
2673                      "operating on %s vectors%s.\n",
2674                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2675                      ? "partial" : "full",
2676                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2677                      ? " for epilogue loop" : "");
2678
2679   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2680     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2681        && need_peeling_or_partial_vectors_p);
2682
2683   /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2684      analysis that we don't know whether the loop is vectorized by partial
2685      vectors (More details see tree-vect-loop-manip.cc).
2686
2687      However, SELECT_VL vectorizaton style should only applied on partial
2688      vectorization since SELECT_VL is the GIMPLE IR that calculates the
2689      number of elements to be process for each iteration.
2690
2691      After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2692      if it is not partial vectorized loop.  */
2693   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2694     LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2695
2696   return opt_result::success ();
2697 }
2698
2699 /* Function vect_analyze_loop_2.
2700
2701    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2702    analyses will record information in some members of LOOP_VINFO.  FATAL
2703    indicates if some analysis meets fatal error.  If one non-NULL pointer
2704    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2705    worked out suggested unroll factor, while one NULL pointer shows it's
2706    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2707    is to hold the slp decision when the suggested unroll factor is worked
2708    out.  */
2709 static opt_result
2710 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2711                      unsigned *suggested_unroll_factor,
2712                      bool& slp_done_for_suggested_uf)
2713 {
2714   opt_result ok = opt_result::success ();
2715   int res;
2716   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2717   poly_uint64 min_vf = 2;
2718   loop_vec_info orig_loop_vinfo = NULL;
2719
2720   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2721      loop_vec_info of the first vectorized loop.  */
2722   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2723     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2724   else
2725     orig_loop_vinfo = loop_vinfo;
2726   gcc_assert (orig_loop_vinfo);
2727
2728   /* The first group of checks is independent of the vector size.  */
2729   fatal = true;
2730
2731   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2732       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2733     return opt_result::failure_at (vect_location,
2734                                    "not vectorized: simd if(0)\n");
2735
2736   /* Find all data references in the loop (which correspond to vdefs/vuses)
2737      and analyze their evolution in the loop.  */
2738
2739   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2740
2741   /* Gather the data references and count stmts in the loop.  */
2742   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2743     {
2744       opt_result res
2745         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2746                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2747                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2748       if (!res)
2749         {
2750           if (dump_enabled_p ())
2751             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2752                              "not vectorized: loop contains function "
2753                              "calls or data references that cannot "
2754                              "be analyzed\n");
2755           return res;
2756         }
2757       loop_vinfo->shared->save_datarefs ();
2758     }
2759   else
2760     loop_vinfo->shared->check_datarefs ();
2761
2762   /* Analyze the data references and also adjust the minimal
2763      vectorization factor according to the loads and stores.  */
2764
2765   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2766   if (!ok)
2767     {
2768       if (dump_enabled_p ())
2769         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2770                          "bad data references.\n");
2771       return ok;
2772     }
2773
2774   /* Check if we are applying unroll factor now.  */
2775   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2776   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2777
2778   /* If the slp decision is false when suggested unroll factor is worked
2779      out, and we are applying suggested unroll factor, we can simply skip
2780      all slp related analyses this time.  */
2781   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2782
2783   /* Classify all cross-iteration scalar data-flow cycles.
2784      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2785   vect_analyze_scalar_cycles (loop_vinfo, slp);
2786
2787   vect_pattern_recog (loop_vinfo);
2788
2789   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2790
2791   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2792      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2793
2794   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2795   if (!ok)
2796     {
2797       if (dump_enabled_p ())
2798         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2799                          "bad data access.\n");
2800       return ok;
2801     }
2802
2803   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2804
2805   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2806   if (!ok)
2807     {
2808       if (dump_enabled_p ())
2809         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2810                          "unexpected pattern.\n");
2811       return ok;
2812     }
2813
2814   /* While the rest of the analysis below depends on it in some way.  */
2815   fatal = false;
2816
2817   /* Analyze data dependences between the data-refs in the loop
2818      and adjust the maximum vectorization factor according to
2819      the dependences.
2820      FORNOW: fail at the first data dependence that we encounter.  */
2821
2822   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2823   if (!ok)
2824     {
2825       if (dump_enabled_p ())
2826         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2827                          "bad data dependence.\n");
2828       return ok;
2829     }
2830   if (max_vf != MAX_VECTORIZATION_FACTOR
2831       && maybe_lt (max_vf, min_vf))
2832     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2833   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2834
2835   ok = vect_determine_vectorization_factor (loop_vinfo);
2836   if (!ok)
2837     {
2838       if (dump_enabled_p ())
2839         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2840                          "can't determine vectorization factor.\n");
2841       return ok;
2842     }
2843
2844   /* Compute the scalar iteration cost.  */
2845   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2846
2847   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2848
2849   if (slp)
2850     {
2851       /* Check the SLP opportunities in the loop, analyze and build
2852          SLP trees.  */
2853       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2854       if (!ok)
2855         return ok;
2856
2857       /* If there are any SLP instances mark them as pure_slp.  */
2858       slp = vect_make_slp_decision (loop_vinfo);
2859       if (slp)
2860         {
2861           /* Find stmts that need to be both vectorized and SLPed.  */
2862           vect_detect_hybrid_slp (loop_vinfo);
2863
2864           /* Update the vectorization factor based on the SLP decision.  */
2865           vect_update_vf_for_slp (loop_vinfo);
2866
2867           /* Optimize the SLP graph with the vectorization factor fixed.  */
2868           vect_optimize_slp (loop_vinfo);
2869
2870           /* Gather the loads reachable from the SLP graph entries.  */
2871           vect_gather_slp_loads (loop_vinfo);
2872         }
2873     }
2874
2875   bool saved_can_use_partial_vectors_p
2876     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2877
2878   /* We don't expect to have to roll back to anything other than an empty
2879      set of rgroups.  */
2880   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2881
2882   /* This is the point where we can re-start analysis with SLP forced off.  */
2883 start_over:
2884
2885   /* Apply the suggested unrolling factor, this was determined by the backend
2886      during finish_cost the first time we ran the analyzis for this
2887      vector mode.  */
2888   if (applying_suggested_uf)
2889     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2890
2891   /* Now the vectorization factor is final.  */
2892   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2893   gcc_assert (known_ne (vectorization_factor, 0U));
2894
2895   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2896     {
2897       dump_printf_loc (MSG_NOTE, vect_location,
2898                        "vectorization_factor = ");
2899       dump_dec (MSG_NOTE, vectorization_factor);
2900       dump_printf (MSG_NOTE, ", niters = %wd\n",
2901                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2902     }
2903
2904   if (max_vf != MAX_VECTORIZATION_FACTOR
2905       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2906     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2907
2908   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2909
2910   /* Analyze the alignment of the data-refs in the loop.
2911      Fail if a data reference is found that cannot be vectorized.  */
2912
2913   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2914   if (!ok)
2915     {
2916       if (dump_enabled_p ())
2917         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2918                          "bad data alignment.\n");
2919       return ok;
2920     }
2921
2922   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2923      It is important to call pruning after vect_analyze_data_ref_accesses,
2924      since we use grouping information gathered by interleaving analysis.  */
2925   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2926   if (!ok)
2927     return ok;
2928
2929   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2930      vectorization, since we do not want to add extra peeling or
2931      add versioning for alignment.  */
2932   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2933     /* This pass will decide on using loop versioning and/or loop peeling in
2934        order to enhance the alignment of data references in the loop.  */
2935     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2936   if (!ok)
2937     return ok;
2938
2939   if (slp)
2940     {
2941       /* Analyze operations in the SLP instances.  Note this may
2942          remove unsupported SLP instances which makes the above
2943          SLP kind detection invalid.  */
2944       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2945       vect_slp_analyze_operations (loop_vinfo);
2946       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2947         {
2948           ok = opt_result::failure_at (vect_location,
2949                                        "unsupported SLP instances\n");
2950           goto again;
2951         }
2952
2953       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2954       slp_tree load_node, slp_root;
2955       unsigned i, x;
2956       slp_instance instance;
2957       bool can_use_lanes = true;
2958       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2959         {
2960           slp_root = SLP_INSTANCE_TREE (instance);
2961           int group_size = SLP_TREE_LANES (slp_root);
2962           tree vectype = SLP_TREE_VECTYPE (slp_root);
2963           bool loads_permuted = false;
2964           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2965             {
2966               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2967                 continue;
2968               unsigned j;
2969               stmt_vec_info load_info;
2970               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2971                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2972                   {
2973                     loads_permuted = true;
2974                     break;
2975                   }
2976             }
2977
2978           /* If the loads and stores can be handled with load/store-lane
2979              instructions record it and move on to the next instance.  */
2980           if (loads_permuted
2981               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2982               && vect_store_lanes_supported (vectype, group_size, false)
2983                    != IFN_LAST)
2984             {
2985               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2986                 if (STMT_VINFO_GROUPED_ACCESS
2987                       (SLP_TREE_REPRESENTATIVE (load_node)))
2988                   {
2989                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2990                         (SLP_TREE_REPRESENTATIVE (load_node));
2991                     /* Use SLP for strided accesses (or if we can't
2992                        load-lanes).  */
2993                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2994                         || vect_load_lanes_supported
2995                              (STMT_VINFO_VECTYPE (stmt_vinfo),
2996                               DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2997                       break;
2998                   }
2999
3000               can_use_lanes
3001                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3002
3003               if (can_use_lanes && dump_enabled_p ())
3004                 dump_printf_loc (MSG_NOTE, vect_location,
3005                                  "SLP instance %p can use load/store-lanes\n",
3006                                  (void *) instance);
3007             }
3008           else
3009             {
3010               can_use_lanes = false;
3011               break;
3012             }
3013         }
3014
3015       /* If all SLP instances can use load/store-lanes abort SLP and try again
3016          with SLP disabled.  */
3017       if (can_use_lanes)
3018         {
3019           ok = opt_result::failure_at (vect_location,
3020                                        "Built SLP cancelled: can use "
3021                                        "load/store-lanes\n");
3022           if (dump_enabled_p ())
3023             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3024                              "Built SLP cancelled: all SLP instances support "
3025                              "load/store-lanes\n");
3026           goto again;
3027         }
3028     }
3029
3030   /* Dissolve SLP-only groups.  */
3031   vect_dissolve_slp_only_groups (loop_vinfo);
3032
3033   /* Scan all the remaining operations in the loop that are not subject
3034      to SLP and make sure they are vectorizable.  */
3035   ok = vect_analyze_loop_operations (loop_vinfo);
3036   if (!ok)
3037     {
3038       if (dump_enabled_p ())
3039         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3040                          "bad operation or unsupported loop bound.\n");
3041       return ok;
3042     }
3043
3044   /* For now, we don't expect to mix both masking and length approaches for one
3045      loop, disable it if both are recorded.  */
3046   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3047       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3048       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3049     {
3050       if (dump_enabled_p ())
3051         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3052                          "can't vectorize a loop with partial vectors"
3053                          " because we don't expect to mix different"
3054                          " approaches with partial vectors for the"
3055                          " same loop.\n");
3056       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3057     }
3058
3059   /* If we still have the option of using partial vectors,
3060      check whether we can generate the necessary loop controls.  */
3061   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3062     {
3063       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3064         {
3065           if (!vect_verify_full_masking (loop_vinfo)
3066               && !vect_verify_full_masking_avx512 (loop_vinfo))
3067             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3068         }
3069       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3070         if (!vect_verify_loop_lens (loop_vinfo))
3071           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3072     }
3073
3074   /* If we're vectorizing a loop that uses length "controls" and
3075      can iterate more than once, we apply decrementing IV approach
3076      in loop control.  */
3077   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3078       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3079       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3080       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3081            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3082                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3083     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3084
3085   /* If a loop uses length controls and has a decrementing loop control IV,
3086      we will normally pass that IV through a MIN_EXPR to calcaluate the
3087      basis for the length controls.  E.g. in a loop that processes one
3088      element per scalar iteration, the number of elements would be
3089      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3090
3091      This MIN_EXPR approach allows us to use pointer IVs with an invariant
3092      step, since only the final iteration of the vector loop can have
3093      inactive lanes.
3094
3095      However, some targets have a dedicated instruction for calculating the
3096      preferred length, given the total number of elements that still need to
3097      be processed.  This is encapsulated in the SELECT_VL internal function.
3098
3099      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3100      to determine the basis for the length controls.  However, unlike the
3101      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3102      lanes inactive in any iteration of the vector loop, not just the last
3103      iteration.  This SELECT_VL approach therefore requires us to use pointer
3104      IVs with variable steps.
3105
3106      Once we've decided how many elements should be processed by one
3107      iteration of the vector loop, we need to populate the rgroup controls.
3108      If a loop has multiple rgroups, we need to make sure that those rgroups
3109      "line up" (that is, they must be consistent about which elements are
3110      active and which aren't).  This is done by vect_adjust_loop_lens_control.
3111
3112      In principle, it would be possible to use vect_adjust_loop_lens_control
3113      on either the result of a MIN_EXPR or the result of a SELECT_VL.
3114      However:
3115
3116      (1) In practice, it only makes sense to use SELECT_VL when a vector
3117          operation will be controlled directly by the result.  It is not
3118          worth using SELECT_VL if it would only be the input to other
3119          calculations.
3120
3121      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3122          pointer IV will need N updates by a variable amount (N-1 updates
3123          within the iteration and 1 update to move to the next iteration).
3124
3125      Because of this, we prefer to use the MIN_EXPR approach whenever there
3126      is more than one length control.
3127
3128      In addition, SELECT_VL always operates to a granularity of 1 unit.
3129      If we wanted to use it to control an SLP operation on N consecutive
3130      elements, we would need to make the SELECT_VL inputs measure scalar
3131      iterations (rather than elements) and then multiply the SELECT_VL
3132      result by N.  But using SELECT_VL this way is inefficient because
3133      of (1) above.
3134
3135      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3136         satisfied:
3137
3138      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3139      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3140
3141      Since SELECT_VL (variable step) will make SCEV analysis failed and then
3142      we will fail to gain benefits of following unroll optimizations. We prefer
3143      using the MIN_EXPR approach in this situation.  */
3144   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3145     {
3146       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3147       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3148                                           OPTIMIZE_FOR_SPEED)
3149           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3150           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3151           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3152               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3153         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3154     }
3155
3156   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3157      assuming that the loop will be used as a main loop.  We will redo
3158      this analysis later if we instead decide to use the loop as an
3159      epilogue loop.  */
3160   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3161   if (!ok)
3162     return ok;
3163
3164   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3165      to be able to handle fewer than VF scalars, or needs to have a lower VF
3166      than the main loop.  */
3167   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3168       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3169     {
3170       poly_uint64 unscaled_vf
3171         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3172                      orig_loop_vinfo->suggested_unroll_factor);
3173       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3174         return opt_result::failure_at (vect_location,
3175                                        "Vectorization factor too high for"
3176                                        " epilogue loop.\n");
3177     }
3178
3179   /* Check the costings of the loop make vectorizing worthwhile.  */
3180   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3181   if (res < 0)
3182     {
3183       ok = opt_result::failure_at (vect_location,
3184                                    "Loop costings may not be worthwhile.\n");
3185       goto again;
3186     }
3187   if (!res)
3188     return opt_result::failure_at (vect_location,
3189                                    "Loop costings not worthwhile.\n");
3190
3191   /* If an epilogue loop is required make sure we can create one.  */
3192   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3193       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3194       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3195     {
3196       if (dump_enabled_p ())
3197         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3198       if (!vect_can_advance_ivs_p (loop_vinfo)
3199           || !slpeel_can_duplicate_loop_p (loop,
3200                                            LOOP_VINFO_IV_EXIT (loop_vinfo),
3201                                            LOOP_VINFO_IV_EXIT (loop_vinfo)))
3202         {
3203           ok = opt_result::failure_at (vect_location,
3204                                        "not vectorized: can't create required "
3205                                        "epilog loop\n");
3206           goto again;
3207         }
3208     }
3209
3210   /* During peeling, we need to check if number of loop iterations is
3211      enough for both peeled prolog loop and vector loop.  This check
3212      can be merged along with threshold check of loop versioning, so
3213      increase threshold for this case if necessary.
3214
3215      If we are analyzing an epilogue we still want to check what its
3216      versioning threshold would be.  If we decide to vectorize the epilogues we
3217      will want to use the lowest versioning threshold of all epilogues and main
3218      loop.  This will enable us to enter a vectorized epilogue even when
3219      versioning the loop.  We can't simply check whether the epilogue requires
3220      versioning though since we may have skipped some versioning checks when
3221      analyzing the epilogue.  For instance, checks for alias versioning will be
3222      skipped when dealing with epilogues as we assume we already checked them
3223      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3224   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3225     {
3226       poly_uint64 niters_th = 0;
3227       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3228
3229       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3230         {
3231           /* Niters for peeled prolog loop.  */
3232           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3233             {
3234               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3235               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3236               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3237             }
3238           else
3239             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3240         }
3241
3242       /* Niters for at least one iteration of vectorized loop.  */
3243       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3244         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3245       /* One additional iteration because of peeling for gap.  */
3246       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3247         niters_th += 1;
3248
3249       /*  Use the same condition as vect_transform_loop to decide when to use
3250           the cost to determine a versioning threshold.  */
3251       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3252           && ordered_p (th, niters_th))
3253         niters_th = ordered_max (poly_uint64 (th), niters_th);
3254
3255       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3256     }
3257
3258   gcc_assert (known_eq (vectorization_factor,
3259                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3260
3261   slp_done_for_suggested_uf = slp;
3262
3263   /* Ok to vectorize!  */
3264   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3265   return opt_result::success ();
3266
3267 again:
3268   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3269   gcc_assert (!ok);
3270
3271   /* Try again with SLP forced off but if we didn't do any SLP there is
3272      no point in re-trying.  */
3273   if (!slp)
3274     return ok;
3275
3276   /* If the slp decision is true when suggested unroll factor is worked
3277      out, and we are applying suggested unroll factor, we don't need to
3278      re-try any more.  */
3279   if (applying_suggested_uf && slp_done_for_suggested_uf)
3280     return ok;
3281
3282   /* If there are reduction chains re-trying will fail anyway.  */
3283   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3284     return ok;
3285
3286   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3287      via interleaving or lane instructions.  */
3288   slp_instance instance;
3289   slp_tree node;
3290   unsigned i, j;
3291   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3292     {
3293       stmt_vec_info vinfo;
3294       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3295       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3296         continue;
3297       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3298       unsigned int size = DR_GROUP_SIZE (vinfo);
3299       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3300       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3301          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3302          && ! vect_grouped_store_supported (vectype, size))
3303         return opt_result::failure_at (vinfo->stmt,
3304                                        "unsupported grouped store\n");
3305       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3306         {
3307           vinfo = SLP_TREE_REPRESENTATIVE (node);
3308           if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3309             {
3310               vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3311               bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3312               size = DR_GROUP_SIZE (vinfo);
3313               vectype = STMT_VINFO_VECTYPE (vinfo);
3314               if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3315                   && ! vect_grouped_load_supported (vectype, single_element_p,
3316                                                     size))
3317                 return opt_result::failure_at (vinfo->stmt,
3318                                                "unsupported grouped load\n");
3319             }
3320         }
3321     }
3322
3323   if (dump_enabled_p ())
3324     dump_printf_loc (MSG_NOTE, vect_location,
3325                      "re-trying with SLP disabled\n");
3326
3327   /* Roll back state appropriately.  No SLP this time.  */
3328   slp = false;
3329   /* Restore vectorization factor as it were without SLP.  */
3330   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3331   /* Free the SLP instances.  */
3332   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3333     vect_free_slp_instance (instance);
3334   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3335   /* Reset SLP type to loop_vect on all stmts.  */
3336   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3337     {
3338       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3339       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3340            !gsi_end_p (si); gsi_next (&si))
3341         {
3342           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3343           STMT_SLP_TYPE (stmt_info) = loop_vect;
3344           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3345               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3346             {
3347               /* vectorizable_reduction adjusts reduction stmt def-types,
3348                  restore them to that of the PHI.  */
3349               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3350                 = STMT_VINFO_DEF_TYPE (stmt_info);
3351               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3352                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3353                 = STMT_VINFO_DEF_TYPE (stmt_info);
3354             }
3355         }
3356       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3357            !gsi_end_p (si); gsi_next (&si))
3358         {
3359           if (is_gimple_debug (gsi_stmt (si)))
3360             continue;
3361           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3362           STMT_SLP_TYPE (stmt_info) = loop_vect;
3363           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3364             {
3365               stmt_vec_info pattern_stmt_info
3366                 = STMT_VINFO_RELATED_STMT (stmt_info);
3367               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3368                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3369
3370               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3371               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3372               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3373                    !gsi_end_p (pi); gsi_next (&pi))
3374                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3375                   = loop_vect;
3376             }
3377         }
3378     }
3379   /* Free optimized alias test DDRS.  */
3380   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3381   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3382   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3383   /* Reset target cost data.  */
3384   delete loop_vinfo->vector_costs;
3385   loop_vinfo->vector_costs = nullptr;
3386   /* Reset accumulated rgroup information.  */
3387   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3388   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3389   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3390   /* Reset assorted flags.  */
3391   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3392   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3393   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3394   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3395   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3396     = saved_can_use_partial_vectors_p;
3397
3398   goto start_over;
3399 }
3400
3401 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3402    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
3403    OLD_LOOP_VINFO is better unless something specifically indicates
3404    otherwise.
3405
3406    Note that this deliberately isn't a partial order.  */
3407
3408 static bool
3409 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3410                           loop_vec_info old_loop_vinfo)
3411 {
3412   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3413   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3414
3415   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3416   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3417
3418   /* Always prefer a VF of loop->simdlen over any other VF.  */
3419   if (loop->simdlen)
3420     {
3421       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3422       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3423       if (new_simdlen_p != old_simdlen_p)
3424         return new_simdlen_p;
3425     }
3426
3427   const auto *old_costs = old_loop_vinfo->vector_costs;
3428   const auto *new_costs = new_loop_vinfo->vector_costs;
3429   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3430     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3431
3432   return new_costs->better_main_loop_than_p (old_costs);
3433 }
3434
3435 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
3436    true if we should.  */
3437
3438 static bool
3439 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3440                         loop_vec_info old_loop_vinfo)
3441 {
3442   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3443     return false;
3444
3445   if (dump_enabled_p ())
3446     dump_printf_loc (MSG_NOTE, vect_location,
3447                      "***** Preferring vector mode %s to vector mode %s\n",
3448                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3449                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3450   return true;
3451 }
3452
3453 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3454    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3455    MODE_I to the next mode useful to analyze.
3456    Return the loop_vinfo on success and wrapped null on failure.  */
3457
3458 static opt_loop_vec_info
3459 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3460                      const vect_loop_form_info *loop_form_info,
3461                      loop_vec_info main_loop_vinfo,
3462                      const vector_modes &vector_modes, unsigned &mode_i,
3463                      machine_mode &autodetected_vector_mode,
3464                      bool &fatal)
3465 {
3466   loop_vec_info loop_vinfo
3467     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3468
3469   machine_mode vector_mode = vector_modes[mode_i];
3470   loop_vinfo->vector_mode = vector_mode;
3471   unsigned int suggested_unroll_factor = 1;
3472   bool slp_done_for_suggested_uf = false;
3473
3474   /* Run the main analysis.  */
3475   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3476                                         &suggested_unroll_factor,
3477                                         slp_done_for_suggested_uf);
3478   if (dump_enabled_p ())
3479     dump_printf_loc (MSG_NOTE, vect_location,
3480                      "***** Analysis %s with vector mode %s\n",
3481                      res ? "succeeded" : " failed",
3482                      GET_MODE_NAME (loop_vinfo->vector_mode));
3483
3484   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3485     {
3486       if (dump_enabled_p ())
3487         dump_printf_loc (MSG_NOTE, vect_location,
3488                          "***** Re-trying analysis for unrolling"
3489                          " with unroll factor %d and slp %s.\n",
3490                          suggested_unroll_factor,
3491                          slp_done_for_suggested_uf ? "on" : "off");
3492       loop_vec_info unroll_vinfo
3493         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3494       unroll_vinfo->vector_mode = vector_mode;
3495       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3496       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3497                                                 slp_done_for_suggested_uf);
3498       if (new_res)
3499         {
3500           delete loop_vinfo;
3501           loop_vinfo = unroll_vinfo;
3502         }
3503       else
3504         delete unroll_vinfo;
3505     }
3506
3507   /* Remember the autodetected vector mode.  */
3508   if (vector_mode == VOIDmode)
3509     autodetected_vector_mode = loop_vinfo->vector_mode;
3510
3511   /* Advance mode_i, first skipping modes that would result in the
3512      same analysis result.  */
3513   while (mode_i + 1 < vector_modes.length ()
3514          && vect_chooses_same_modes_p (loop_vinfo,
3515                                        vector_modes[mode_i + 1]))
3516     {
3517       if (dump_enabled_p ())
3518         dump_printf_loc (MSG_NOTE, vect_location,
3519                          "***** The result for vector mode %s would"
3520                          " be the same\n",
3521                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3522       mode_i += 1;
3523     }
3524   if (mode_i + 1 < vector_modes.length ()
3525       && VECTOR_MODE_P (autodetected_vector_mode)
3526       && (related_vector_mode (vector_modes[mode_i + 1],
3527                                GET_MODE_INNER (autodetected_vector_mode))
3528           == autodetected_vector_mode)
3529       && (related_vector_mode (autodetected_vector_mode,
3530                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3531           == vector_modes[mode_i + 1]))
3532     {
3533       if (dump_enabled_p ())
3534         dump_printf_loc (MSG_NOTE, vect_location,
3535                          "***** Skipping vector mode %s, which would"
3536                          " repeat the analysis for %s\n",
3537                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3538                          GET_MODE_NAME (autodetected_vector_mode));
3539       mode_i += 1;
3540     }
3541   mode_i++;
3542
3543   if (!res)
3544     {
3545       delete loop_vinfo;
3546       if (fatal)
3547         gcc_checking_assert (main_loop_vinfo == NULL);
3548       return opt_loop_vec_info::propagate_failure (res);
3549     }
3550
3551   return opt_loop_vec_info::success (loop_vinfo);
3552 }
3553
3554 /* Function vect_analyze_loop.
3555
3556    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3557    for it.  The different analyses will record information in the
3558    loop_vec_info struct.  */
3559 opt_loop_vec_info
3560 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3561 {
3562   DUMP_VECT_SCOPE ("analyze_loop_nest");
3563
3564   if (loop_outer (loop)
3565       && loop_vec_info_for_loop (loop_outer (loop))
3566       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3567     return opt_loop_vec_info::failure_at (vect_location,
3568                                           "outer-loop already vectorized.\n");
3569
3570   if (!find_loop_nest (loop, &shared->loop_nest))
3571     return opt_loop_vec_info::failure_at
3572       (vect_location,
3573        "not vectorized: loop nest containing two or more consecutive inner"
3574        " loops cannot be vectorized\n");
3575
3576   /* Analyze the loop form.  */
3577   vect_loop_form_info loop_form_info;
3578   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3579   if (!res)
3580     {
3581       if (dump_enabled_p ())
3582         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3583                          "bad loop form.\n");
3584       return opt_loop_vec_info::propagate_failure (res);
3585     }
3586   if (!integer_onep (loop_form_info.assumptions))
3587     {
3588       /* We consider to vectorize this loop by versioning it under
3589          some assumptions.  In order to do this, we need to clear
3590          existing information computed by scev and niter analyzer.  */
3591       scev_reset_htab ();
3592       free_numbers_of_iterations_estimates (loop);
3593       /* Also set flag for this loop so that following scev and niter
3594          analysis are done under the assumptions.  */
3595       loop_constraint_set (loop, LOOP_C_FINITE);
3596     }
3597   else
3598     /* Clear the existing niter information to make sure the nonwrapping flag
3599        will be calculated and set propriately.  */
3600     free_numbers_of_iterations_estimates (loop);
3601
3602   auto_vector_modes vector_modes;
3603   /* Autodetect first vector size we try.  */
3604   vector_modes.safe_push (VOIDmode);
3605   unsigned int autovec_flags
3606     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3607                                                     loop->simdlen != 0);
3608   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3609                              && !unlimited_cost_model (loop));
3610   machine_mode autodetected_vector_mode = VOIDmode;
3611   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3612   unsigned int mode_i = 0;
3613   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3614
3615   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3616      a mode has not been analyzed.  */
3617   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3618   for (unsigned i = 0; i < vector_modes.length (); ++i)
3619     cached_vf_per_mode.safe_push (0);
3620
3621   /* First determine the main loop vectorization mode, either the first
3622      one that works, starting with auto-detecting the vector mode and then
3623      following the targets order of preference, or the one with the
3624      lowest cost if pick_lowest_cost_p.  */
3625   while (1)
3626     {
3627       bool fatal;
3628       unsigned int last_mode_i = mode_i;
3629       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3630          failed.  */
3631       cached_vf_per_mode[last_mode_i] = -1;
3632       opt_loop_vec_info loop_vinfo
3633         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3634                                NULL, vector_modes, mode_i,
3635                                autodetected_vector_mode, fatal);
3636       if (fatal)
3637         break;
3638
3639       if (loop_vinfo)
3640         {
3641           /*  Analyzis has been successful so update the VF value.  The
3642               VF should always be a multiple of unroll_factor and we want to
3643               capture the original VF here.  */
3644           cached_vf_per_mode[last_mode_i]
3645             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3646                          loop_vinfo->suggested_unroll_factor);
3647           /* Once we hit the desired simdlen for the first time,
3648              discard any previous attempts.  */
3649           if (simdlen
3650               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3651             {
3652               delete first_loop_vinfo;
3653               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3654               simdlen = 0;
3655             }
3656           else if (pick_lowest_cost_p
3657                    && first_loop_vinfo
3658                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3659             {
3660               /* Pick loop_vinfo over first_loop_vinfo.  */
3661               delete first_loop_vinfo;
3662               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3663             }
3664           if (first_loop_vinfo == NULL)
3665             first_loop_vinfo = loop_vinfo;
3666           else
3667             {
3668               delete loop_vinfo;
3669               loop_vinfo = opt_loop_vec_info::success (NULL);
3670             }
3671
3672           /* Commit to first_loop_vinfo if we have no reason to try
3673              alternatives.  */
3674           if (!simdlen && !pick_lowest_cost_p)
3675             break;
3676         }
3677       if (mode_i == vector_modes.length ()
3678           || autodetected_vector_mode == VOIDmode)
3679         break;
3680
3681       /* Try the next biggest vector size.  */
3682       if (dump_enabled_p ())
3683         dump_printf_loc (MSG_NOTE, vect_location,
3684                          "***** Re-trying analysis with vector mode %s\n",
3685                          GET_MODE_NAME (vector_modes[mode_i]));
3686     }
3687   if (!first_loop_vinfo)
3688     return opt_loop_vec_info::propagate_failure (res);
3689
3690   if (dump_enabled_p ())
3691     dump_printf_loc (MSG_NOTE, vect_location,
3692                      "***** Choosing vector mode %s\n",
3693                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3694
3695   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3696      enabled, SIMDUID is not set, it is the innermost loop and we have
3697      either already found the loop's SIMDLEN or there was no SIMDLEN to
3698      begin with.
3699      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3700   bool vect_epilogues = (!simdlen
3701                          && loop->inner == NULL
3702                          && param_vect_epilogues_nomask
3703                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3704                            /* No code motion support for multiple epilogues so for now
3705                               not supported when multiple exits.  */
3706                          && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3707                          && !loop->simduid);
3708   if (!vect_epilogues)
3709     return first_loop_vinfo;
3710
3711   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3712   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3713
3714   /* For epilogues start the analysis from the first mode.  The motivation
3715      behind starting from the beginning comes from cases where the VECTOR_MODES
3716      array may contain length-agnostic and length-specific modes.  Their
3717      ordering is not guaranteed, so we could end up picking a mode for the main
3718      loop that is after the epilogue's optimal mode.  */
3719   vector_modes[0] = autodetected_vector_mode;
3720   mode_i = 0;
3721
3722   bool supports_partial_vectors =
3723     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3724   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3725
3726   while (1)
3727     {
3728       /* If the target does not support partial vectors we can shorten the
3729          number of modes to analyze for the epilogue as we know we can't pick a
3730          mode that would lead to a VF at least as big as the
3731          FIRST_VINFO_VF.  */
3732       if (!supports_partial_vectors
3733           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3734         {
3735           mode_i++;
3736           if (mode_i == vector_modes.length ())
3737             break;
3738           continue;
3739         }
3740
3741       if (dump_enabled_p ())
3742         dump_printf_loc (MSG_NOTE, vect_location,
3743                          "***** Re-trying epilogue analysis with vector "
3744                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3745
3746       bool fatal;
3747       opt_loop_vec_info loop_vinfo
3748         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3749                                first_loop_vinfo,
3750                                vector_modes, mode_i,
3751                                autodetected_vector_mode, fatal);
3752       if (fatal)
3753         break;
3754
3755       if (loop_vinfo)
3756         {
3757           if (pick_lowest_cost_p)
3758             {
3759               /* Keep trying to roll back vectorization attempts while the
3760                  loop_vec_infos they produced were worse than this one.  */
3761               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3762               while (!vinfos.is_empty ()
3763                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3764                 {
3765                   gcc_assert (vect_epilogues);
3766                   delete vinfos.pop ();
3767                 }
3768             }
3769           /* For now only allow one epilogue loop.  */
3770           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3771             {
3772               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3773               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3774               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3775                           || maybe_ne (lowest_th, 0U));
3776               /* Keep track of the known smallest versioning
3777                  threshold.  */
3778               if (ordered_p (lowest_th, th))
3779                 lowest_th = ordered_min (lowest_th, th);
3780             }
3781           else
3782             {
3783               delete loop_vinfo;
3784               loop_vinfo = opt_loop_vec_info::success (NULL);
3785             }
3786
3787           /* For now only allow one epilogue loop, but allow
3788              pick_lowest_cost_p to replace it, so commit to the
3789              first epilogue if we have no reason to try alternatives.  */
3790           if (!pick_lowest_cost_p)
3791             break;
3792         }
3793
3794       if (mode_i == vector_modes.length ())
3795         break;
3796
3797     }
3798
3799   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3800     {
3801       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3802       if (dump_enabled_p ())
3803         dump_printf_loc (MSG_NOTE, vect_location,
3804                          "***** Choosing epilogue vector mode %s\n",
3805                          GET_MODE_NAME
3806                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3807     }
3808
3809   return first_loop_vinfo;
3810 }
3811
3812 /* Return true if there is an in-order reduction function for CODE, storing
3813    it in *REDUC_FN if so.  */
3814
3815 static bool
3816 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3817 {
3818   /* We support MINUS_EXPR by negating the operand.  This also preserves an
3819      initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3820      (-0.0) = -0.0.  */
3821   if (code == PLUS_EXPR || code == MINUS_EXPR)
3822     {
3823       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3824       return true;
3825     }
3826   return false;
3827 }
3828
3829 /* Function reduction_fn_for_scalar_code
3830
3831    Input:
3832    CODE - tree_code of a reduction operations.
3833
3834    Output:
3835    REDUC_FN - the corresponding internal function to be used to reduce the
3836       vector of partial results into a single scalar result, or IFN_LAST
3837       if the operation is a supported reduction operation, but does not have
3838       such an internal function.
3839
3840    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3841
3842 bool
3843 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3844 {
3845   if (code.is_tree_code ())
3846     switch (tree_code (code))
3847       {
3848       case MAX_EXPR:
3849         *reduc_fn = IFN_REDUC_MAX;
3850         return true;
3851
3852       case MIN_EXPR:
3853         *reduc_fn = IFN_REDUC_MIN;
3854         return true;
3855
3856       case PLUS_EXPR:
3857         *reduc_fn = IFN_REDUC_PLUS;
3858         return true;
3859
3860       case BIT_AND_EXPR:
3861         *reduc_fn = IFN_REDUC_AND;
3862         return true;
3863
3864       case BIT_IOR_EXPR:
3865         *reduc_fn = IFN_REDUC_IOR;
3866         return true;
3867
3868       case BIT_XOR_EXPR:
3869         *reduc_fn = IFN_REDUC_XOR;
3870         return true;
3871
3872       case MULT_EXPR:
3873       case MINUS_EXPR:
3874         *reduc_fn = IFN_LAST;
3875         return true;
3876
3877       default:
3878         return false;
3879       }
3880   else
3881     switch (combined_fn (code))
3882       {
3883       CASE_CFN_FMAX:
3884         *reduc_fn = IFN_REDUC_FMAX;
3885         return true;
3886
3887       CASE_CFN_FMIN:
3888         *reduc_fn = IFN_REDUC_FMIN;
3889         return true;
3890
3891       default:
3892         return false;
3893       }
3894 }
3895
3896 /* If there is a neutral value X such that a reduction would not be affected
3897    by the introduction of additional X elements, return that X, otherwise
3898    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3899    of the scalar elements.  If the reduction has just a single initial value
3900    then INITIAL_VALUE is that value, otherwise it is null.
3901    If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3902    In that case no signed zero is returned.  */
3903
3904 tree
3905 neutral_op_for_reduction (tree scalar_type, code_helper code,
3906                           tree initial_value, bool as_initial)
3907 {
3908   if (code.is_tree_code ())
3909     switch (tree_code (code))
3910       {
3911       case DOT_PROD_EXPR:
3912       case SAD_EXPR:
3913       case MINUS_EXPR:
3914       case BIT_IOR_EXPR:
3915       case BIT_XOR_EXPR:
3916         return build_zero_cst (scalar_type);
3917       case WIDEN_SUM_EXPR:
3918       case PLUS_EXPR:
3919         if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3920           return build_real (scalar_type, dconstm0);
3921         else
3922           return build_zero_cst (scalar_type);
3923
3924       case MULT_EXPR:
3925         return build_one_cst (scalar_type);
3926
3927       case BIT_AND_EXPR:
3928         return build_all_ones_cst (scalar_type);
3929
3930       case MAX_EXPR:
3931       case MIN_EXPR:
3932         return initial_value;
3933
3934       default:
3935         return NULL_TREE;
3936       }
3937   else
3938     switch (combined_fn (code))
3939       {
3940       CASE_CFN_FMIN:
3941       CASE_CFN_FMAX:
3942         return initial_value;
3943
3944       default:
3945         return NULL_TREE;
3946       }
3947 }
3948
3949 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3950    STMT is printed with a message MSG. */
3951
3952 static void
3953 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3954 {
3955   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3956 }
3957
3958 /* Return true if we need an in-order reduction for operation CODE
3959    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3960    overflow must wrap.  */
3961
3962 bool
3963 needs_fold_left_reduction_p (tree type, code_helper code)
3964 {
3965   /* CHECKME: check for !flag_finite_math_only too?  */
3966   if (SCALAR_FLOAT_TYPE_P (type))
3967     {
3968       if (code.is_tree_code ())
3969         switch (tree_code (code))
3970           {
3971           case MIN_EXPR:
3972           case MAX_EXPR:
3973             return false;
3974
3975           default:
3976             return !flag_associative_math;
3977           }
3978       else
3979         switch (combined_fn (code))
3980           {
3981           CASE_CFN_FMIN:
3982           CASE_CFN_FMAX:
3983             return false;
3984
3985           default:
3986             return !flag_associative_math;
3987           }
3988     }
3989
3990   if (INTEGRAL_TYPE_P (type))
3991     return (!code.is_tree_code ()
3992             || !operation_no_trapping_overflow (type, tree_code (code)));
3993
3994   if (SAT_FIXED_POINT_TYPE_P (type))
3995     return true;
3996
3997   return false;
3998 }
3999
4000 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
4001    has a handled computation expression.  Store the main reduction
4002    operation in *CODE.  */
4003
4004 static bool
4005 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4006                       tree loop_arg, code_helper *code,
4007                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4008 {
4009   auto_bitmap visited;
4010   tree lookfor = PHI_RESULT (phi);
4011   ssa_op_iter curri;
4012   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4013   while (USE_FROM_PTR (curr) != loop_arg)
4014     curr = op_iter_next_use (&curri);
4015   curri.i = curri.numops;
4016   do
4017     {
4018       path.safe_push (std::make_pair (curri, curr));
4019       tree use = USE_FROM_PTR (curr);
4020       if (use == lookfor)
4021         break;
4022       gimple *def = SSA_NAME_DEF_STMT (use);
4023       if (gimple_nop_p (def)
4024           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4025         {
4026 pop:
4027           do
4028             {
4029               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4030               curri = x.first;
4031               curr = x.second;
4032               do
4033                 curr = op_iter_next_use (&curri);
4034               /* Skip already visited or non-SSA operands (from iterating
4035                  over PHI args).  */
4036               while (curr != NULL_USE_OPERAND_P
4037                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4038                          || ! bitmap_set_bit (visited,
4039                                               SSA_NAME_VERSION
4040                                                 (USE_FROM_PTR (curr)))));
4041             }
4042           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4043           if (curr == NULL_USE_OPERAND_P)
4044             break;
4045         }
4046       else
4047         {
4048           if (gimple_code (def) == GIMPLE_PHI)
4049             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4050           else
4051             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4052           while (curr != NULL_USE_OPERAND_P
4053                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4054                      || ! bitmap_set_bit (visited,
4055                                           SSA_NAME_VERSION
4056                                             (USE_FROM_PTR (curr)))))
4057             curr = op_iter_next_use (&curri);
4058           if (curr == NULL_USE_OPERAND_P)
4059             goto pop;
4060         }
4061     }
4062   while (1);
4063   if (dump_file && (dump_flags & TDF_DETAILS))
4064     {
4065       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4066       unsigned i;
4067       std::pair<ssa_op_iter, use_operand_p> *x;
4068       FOR_EACH_VEC_ELT (path, i, x)
4069         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4070       dump_printf (MSG_NOTE, "\n");
4071     }
4072
4073   /* Check whether the reduction path detected is valid.  */
4074   bool fail = path.length () == 0;
4075   bool neg = false;
4076   int sign = -1;
4077   *code = ERROR_MARK;
4078   for (unsigned i = 1; i < path.length (); ++i)
4079     {
4080       gimple *use_stmt = USE_STMT (path[i].second);
4081       gimple_match_op op;
4082       if (!gimple_extract_op (use_stmt, &op))
4083         {
4084           fail = true;
4085           break;
4086         }
4087       unsigned int opi = op.num_ops;
4088       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4089         {
4090           /* The following make sure we can compute the operand index
4091              easily plus it mostly disallows chaining via COND_EXPR condition
4092              operands.  */
4093           for (opi = 0; opi < op.num_ops; ++opi)
4094             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4095               break;
4096         }
4097       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4098         {
4099           for (opi = 0; opi < op.num_ops; ++opi)
4100             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4101               break;
4102         }
4103       if (opi == op.num_ops)
4104         {
4105           fail = true;
4106           break;
4107         }
4108       op.code = canonicalize_code (op.code, op.type);
4109       if (op.code == MINUS_EXPR)
4110         {
4111           op.code = PLUS_EXPR;
4112           /* Track whether we negate the reduction value each iteration.  */
4113           if (op.ops[1] == op.ops[opi])
4114             neg = ! neg;
4115         }
4116       else if (op.code == IFN_COND_SUB)
4117         {
4118           op.code = IFN_COND_ADD;
4119           /* Track whether we negate the reduction value each iteration.  */
4120           if (op.ops[2] == op.ops[opi])
4121             neg = ! neg;
4122         }
4123       if (CONVERT_EXPR_CODE_P (op.code)
4124           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4125         ;
4126       else if (*code == ERROR_MARK)
4127         {
4128           *code = op.code;
4129           sign = TYPE_SIGN (op.type);
4130         }
4131       else if (op.code != *code)
4132         {
4133           fail = true;
4134           break;
4135         }
4136       else if ((op.code == MIN_EXPR
4137                 || op.code == MAX_EXPR)
4138                && sign != TYPE_SIGN (op.type))
4139         {
4140           fail = true;
4141           break;
4142         }
4143       /* Check there's only a single stmt the op is used on.  For the
4144          not value-changing tail and the last stmt allow out-of-loop uses.
4145          ???  We could relax this and handle arbitrary live stmts by
4146          forcing a scalar epilogue for example.  */
4147       imm_use_iterator imm_iter;
4148       use_operand_p use_p;
4149       gimple *op_use_stmt;
4150       unsigned cnt = 0;
4151       bool cond_fn_p = op.code.is_internal_fn ()
4152         && (conditional_internal_fn_code (internal_fn (op.code))
4153             != ERROR_MARK);
4154
4155       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4156         {
4157         /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4158            op1 twice (once as definition, once as else) in the same operation.
4159            Allow this.  */
4160           if (cond_fn_p && op_use_stmt == use_stmt)
4161             {
4162               gcall *call = as_a<gcall *> (use_stmt);
4163               unsigned else_pos
4164                 = internal_fn_else_index (internal_fn (op.code));
4165
4166               for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4167                 {
4168                   if (j == else_pos)
4169                     continue;
4170                   if (gimple_call_arg (call, j) == op.ops[opi])
4171                     cnt++;
4172                 }
4173             }
4174           else if (!is_gimple_debug (op_use_stmt)
4175                    && (*code != ERROR_MARK
4176                        || flow_bb_inside_loop_p (loop,
4177                                                  gimple_bb (op_use_stmt))))
4178             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4179               cnt++;
4180         }
4181
4182       if (cnt != 1)
4183         {
4184           fail = true;
4185           break;
4186         }
4187     }
4188   return ! fail && ! neg && *code != ERROR_MARK;
4189 }
4190
4191 bool
4192 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4193                       tree loop_arg, enum tree_code code)
4194 {
4195   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4196   code_helper code_;
4197   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4198           && code_ == code);
4199 }
4200
4201
4202
4203 /* Function vect_is_simple_reduction
4204
4205    (1) Detect a cross-iteration def-use cycle that represents a simple
4206    reduction computation.  We look for the following pattern:
4207
4208    loop_header:
4209      a1 = phi < a0, a2 >
4210      a3 = ...
4211      a2 = operation (a3, a1)
4212
4213    or
4214
4215    a3 = ...
4216    loop_header:
4217      a1 = phi < a0, a2 >
4218      a2 = operation (a3, a1)
4219
4220    such that:
4221    1. operation is commutative and associative and it is safe to
4222       change the order of the computation
4223    2. no uses for a2 in the loop (a2 is used out of the loop)
4224    3. no uses of a1 in the loop besides the reduction operation
4225    4. no uses of a1 outside the loop.
4226
4227    Conditions 1,4 are tested here.
4228    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4229
4230    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4231    nested cycles.
4232
4233    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4234    reductions:
4235
4236      a1 = phi < a0, a2 >
4237      inner loop (def of a3)
4238      a2 = phi < a3 >
4239
4240    (4) Detect condition expressions, ie:
4241      for (int i = 0; i < N; i++)
4242        if (a[i] < val)
4243         ret_val = a[i];
4244
4245 */
4246
4247 static stmt_vec_info
4248 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4249                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4250 {
4251   gphi *phi = as_a <gphi *> (phi_info->stmt);
4252   gimple *phi_use_stmt = NULL;
4253   imm_use_iterator imm_iter;
4254   use_operand_p use_p;
4255
4256   *double_reduc = false;
4257   *reduc_chain_p = false;
4258   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4259
4260   tree phi_name = PHI_RESULT (phi);
4261   /* ???  If there are no uses of the PHI result the inner loop reduction
4262      won't be detected as possibly double-reduction by vectorizable_reduction
4263      because that tries to walk the PHI arg from the preheader edge which
4264      can be constant.  See PR60382.  */
4265   if (has_zero_uses (phi_name))
4266     return NULL;
4267   class loop *loop = (gimple_bb (phi))->loop_father;
4268   unsigned nphi_def_loop_uses = 0;
4269   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4270     {
4271       gimple *use_stmt = USE_STMT (use_p);
4272       if (is_gimple_debug (use_stmt))
4273         continue;
4274
4275       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4276         {
4277           if (dump_enabled_p ())
4278             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4279                              "intermediate value used outside loop.\n");
4280
4281           return NULL;
4282         }
4283
4284       /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4285          op1 twice (once as definition, once as else) in the same operation.
4286          Only count it as one. */
4287       if (use_stmt != phi_use_stmt)
4288         {
4289           nphi_def_loop_uses++;
4290           phi_use_stmt = use_stmt;
4291         }
4292     }
4293
4294   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4295   if (TREE_CODE (latch_def) != SSA_NAME)
4296     {
4297       if (dump_enabled_p ())
4298         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4299                          "reduction: not ssa_name: %T\n", latch_def);
4300       return NULL;
4301     }
4302
4303   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4304   if (!def_stmt_info
4305       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4306     return NULL;
4307
4308   bool nested_in_vect_loop
4309     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4310   unsigned nlatch_def_loop_uses = 0;
4311   auto_vec<gphi *, 3> lcphis;
4312   bool inner_loop_of_double_reduc = false;
4313   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4314     {
4315       gimple *use_stmt = USE_STMT (use_p);
4316       if (is_gimple_debug (use_stmt))
4317         continue;
4318       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4319         nlatch_def_loop_uses++;
4320       else
4321         {
4322           /* We can have more than one loop-closed PHI.  */
4323           lcphis.safe_push (as_a <gphi *> (use_stmt));
4324           if (nested_in_vect_loop
4325               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4326                   == vect_double_reduction_def))
4327             inner_loop_of_double_reduc = true;
4328         }
4329     }
4330
4331   /* If we are vectorizing an inner reduction we are executing that
4332      in the original order only in case we are not dealing with a
4333      double reduction.  */
4334   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4335     {
4336       if (dump_enabled_p ())
4337         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4338                         "detected nested cycle: ");
4339       return def_stmt_info;
4340     }
4341
4342   /* When the inner loop of a double reduction ends up with more than
4343      one loop-closed PHI we have failed to classify alternate such
4344      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4345   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4346     {
4347       if (dump_enabled_p ())
4348         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4349                          "unhandle double reduction\n");
4350       return NULL;
4351     }
4352
4353   /* If this isn't a nested cycle or if the nested cycle reduction value
4354      is used ouside of the inner loop we cannot handle uses of the reduction
4355      value.  */
4356   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4357     {
4358       if (dump_enabled_p ())
4359         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4360                          "reduction used in loop.\n");
4361       return NULL;
4362     }
4363
4364   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4365      defined in the inner loop.  */
4366   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4367     {
4368       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4369       if (gimple_phi_num_args (def_stmt) != 1
4370           || TREE_CODE (op1) != SSA_NAME)
4371         {
4372           if (dump_enabled_p ())
4373             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4374                              "unsupported phi node definition.\n");
4375
4376           return NULL;
4377         }
4378
4379       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4380          and the latch definition op1.  */
4381       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4382       if (gimple_bb (def1)
4383           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4384           && loop->inner
4385           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4386           && (is_gimple_assign (def1) || is_gimple_call (def1))
4387           && is_a <gphi *> (phi_use_stmt)
4388           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4389           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4390                                             loop_latch_edge (loop->inner))))
4391         {
4392           if (dump_enabled_p ())
4393             report_vect_op (MSG_NOTE, def_stmt,
4394                             "detected double reduction: ");
4395
4396           *double_reduc = true;
4397           return def_stmt_info;
4398         }
4399
4400       return NULL;
4401     }
4402
4403   /* Look for the expression computing latch_def from then loop PHI result.  */
4404   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4405   code_helper code;
4406   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4407                             path))
4408     {
4409       STMT_VINFO_REDUC_CODE (phi_info) = code;
4410       if (code == COND_EXPR && !nested_in_vect_loop)
4411         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4412
4413       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4414          reduction chain for which the additional restriction is that
4415          all operations in the chain are the same.  */
4416       auto_vec<stmt_vec_info, 8> reduc_chain;
4417       unsigned i;
4418       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4419       for (i = path.length () - 1; i >= 1; --i)
4420         {
4421           gimple *stmt = USE_STMT (path[i].second);
4422           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4423           gimple_match_op op;
4424           if (!gimple_extract_op (stmt, &op))
4425             gcc_unreachable ();
4426           if (gassign *assign = dyn_cast<gassign *> (stmt))
4427             STMT_VINFO_REDUC_IDX (stmt_info)
4428               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4429           else
4430             {
4431               gcall *call = as_a<gcall *> (stmt);
4432               STMT_VINFO_REDUC_IDX (stmt_info)
4433                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4434             }
4435           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4436                                      && (i == 1 || i == path.length () - 1));
4437           if ((op.code != code && !leading_conversion)
4438               /* We can only handle the final value in epilogue
4439                  generation for reduction chains.  */
4440               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4441             is_slp_reduc = false;
4442           /* For reduction chains we support a trailing/leading
4443              conversions.  We do not store those in the actual chain.  */
4444           if (leading_conversion)
4445             continue;
4446           reduc_chain.safe_push (stmt_info);
4447         }
4448       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4449         {
4450           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4451             {
4452               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4453               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4454             }
4455           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4456           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4457
4458           /* Save the chain for further analysis in SLP detection.  */
4459           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4460           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4461
4462           *reduc_chain_p = true;
4463           if (dump_enabled_p ())
4464             dump_printf_loc (MSG_NOTE, vect_location,
4465                             "reduction: detected reduction chain\n");
4466         }
4467       else if (dump_enabled_p ())
4468         dump_printf_loc (MSG_NOTE, vect_location,
4469                          "reduction: detected reduction\n");
4470
4471       return def_stmt_info;
4472     }
4473
4474   if (dump_enabled_p ())
4475     dump_printf_loc (MSG_NOTE, vect_location,
4476                      "reduction: unknown pattern\n");
4477
4478   return NULL;
4479 }
4480
4481 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4482    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4483    or -1 if not known.  */
4484
4485 static int
4486 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4487 {
4488   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4489   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4490     {
4491       if (dump_enabled_p ())
4492         dump_printf_loc (MSG_NOTE, vect_location,
4493                          "cost model: epilogue peel iters set to vf/2 "
4494                          "because loop iterations are unknown .\n");
4495       return assumed_vf / 2;
4496     }
4497   else
4498     {
4499       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4500       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4501       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4502       /* If we need to peel for gaps, but no peeling is required, we have to
4503          peel VF iterations.  */
4504       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4505         peel_iters_epilogue = assumed_vf;
4506       return peel_iters_epilogue;
4507     }
4508 }
4509
4510 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
4511 int
4512 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4513                              int *peel_iters_epilogue,
4514                              stmt_vector_for_cost *scalar_cost_vec,
4515                              stmt_vector_for_cost *prologue_cost_vec,
4516                              stmt_vector_for_cost *epilogue_cost_vec)
4517 {
4518   int retval = 0;
4519
4520   *peel_iters_epilogue
4521     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4522
4523   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4524     {
4525       /* If peeled iterations are known but number of scalar loop
4526          iterations are unknown, count a taken branch per peeled loop.  */
4527       if (peel_iters_prologue > 0)
4528         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4529                                    vect_prologue);
4530       if (*peel_iters_epilogue > 0)
4531         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4532                                     vect_epilogue);
4533     }
4534
4535   stmt_info_for_cost *si;
4536   int j;
4537   if (peel_iters_prologue)
4538     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4539       retval += record_stmt_cost (prologue_cost_vec,
4540                                   si->count * peel_iters_prologue,
4541                                   si->kind, si->stmt_info, si->misalign,
4542                                   vect_prologue);
4543   if (*peel_iters_epilogue)
4544     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4545       retval += record_stmt_cost (epilogue_cost_vec,
4546                                   si->count * *peel_iters_epilogue,
4547                                   si->kind, si->stmt_info, si->misalign,
4548                                   vect_epilogue);
4549
4550   return retval;
4551 }
4552
4553 /* Function vect_estimate_min_profitable_iters
4554
4555    Return the number of iterations required for the vector version of the
4556    loop to be profitable relative to the cost of the scalar version of the
4557    loop.
4558
4559    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4560    of iterations for vectorization.  -1 value means loop vectorization
4561    is not profitable.  This returned value may be used for dynamic
4562    profitability check.
4563
4564    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4565    for static check against estimated number of iterations.  */
4566
4567 static void
4568 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4569                                     int *ret_min_profitable_niters,
4570                                     int *ret_min_profitable_estimate,
4571                                     unsigned *suggested_unroll_factor)
4572 {
4573   int min_profitable_iters;
4574   int min_profitable_estimate;
4575   int peel_iters_prologue;
4576   int peel_iters_epilogue;
4577   unsigned vec_inside_cost = 0;
4578   int vec_outside_cost = 0;
4579   unsigned vec_prologue_cost = 0;
4580   unsigned vec_epilogue_cost = 0;
4581   int scalar_single_iter_cost = 0;
4582   int scalar_outside_cost = 0;
4583   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4584   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4585   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4586
4587   /* Cost model disabled.  */
4588   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4589     {
4590       if (dump_enabled_p ())
4591         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4592       *ret_min_profitable_niters = 0;
4593       *ret_min_profitable_estimate = 0;
4594       return;
4595     }
4596
4597   /* Requires loop versioning tests to handle misalignment.  */
4598   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4599     {
4600       /*  FIXME: Make cost depend on complexity of individual check.  */
4601       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4602       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4603       if (dump_enabled_p ())
4604         dump_printf (MSG_NOTE,
4605                      "cost model: Adding cost of checks for loop "
4606                      "versioning to treat misalignment.\n");
4607     }
4608
4609   /* Requires loop versioning with alias checks.  */
4610   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4611     {
4612       /*  FIXME: Make cost depend on complexity of individual check.  */
4613       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4614       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4615       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4616       if (len)
4617         /* Count LEN - 1 ANDs and LEN comparisons.  */
4618         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4619                               scalar_stmt, vect_prologue);
4620       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4621       if (len)
4622         {
4623           /* Count LEN - 1 ANDs and LEN comparisons.  */
4624           unsigned int nstmts = len * 2 - 1;
4625           /* +1 for each bias that needs adding.  */
4626           for (unsigned int i = 0; i < len; ++i)
4627             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4628               nstmts += 1;
4629           (void) add_stmt_cost (target_cost_data, nstmts,
4630                                 scalar_stmt, vect_prologue);
4631         }
4632       if (dump_enabled_p ())
4633         dump_printf (MSG_NOTE,
4634                      "cost model: Adding cost of checks for loop "
4635                      "versioning aliasing.\n");
4636     }
4637
4638   /* Requires loop versioning with niter checks.  */
4639   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4640     {
4641       /*  FIXME: Make cost depend on complexity of individual check.  */
4642       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4643                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4644       if (dump_enabled_p ())
4645         dump_printf (MSG_NOTE,
4646                      "cost model: Adding cost of checks for loop "
4647                      "versioning niters.\n");
4648     }
4649
4650   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4651     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4652                           vect_prologue);
4653
4654   /* Count statements in scalar loop.  Using this as scalar cost for a single
4655      iteration for now.
4656
4657      TODO: Add outer loop support.
4658
4659      TODO: Consider assigning different costs to different scalar
4660      statements.  */
4661
4662   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4663
4664   /* Add additional cost for the peeled instructions in prologue and epilogue
4665      loop.  (For fully-masked loops there will be no peeling.)
4666
4667      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4668      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4669
4670      TODO: Build an expression that represents peel_iters for prologue and
4671      epilogue to be used in a run-time test.  */
4672
4673   bool prologue_need_br_taken_cost = false;
4674   bool prologue_need_br_not_taken_cost = false;
4675
4676   /* Calculate peel_iters_prologue.  */
4677   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4678     peel_iters_prologue = 0;
4679   else if (npeel < 0)
4680     {
4681       peel_iters_prologue = assumed_vf / 2;
4682       if (dump_enabled_p ())
4683         dump_printf (MSG_NOTE, "cost model: "
4684                      "prologue peel iters set to vf/2.\n");
4685
4686       /* If peeled iterations are unknown, count a taken branch and a not taken
4687          branch per peeled loop.  Even if scalar loop iterations are known,
4688          vector iterations are not known since peeled prologue iterations are
4689          not known.  Hence guards remain the same.  */
4690       prologue_need_br_taken_cost = true;
4691       prologue_need_br_not_taken_cost = true;
4692     }
4693   else
4694     {
4695       peel_iters_prologue = npeel;
4696       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4697         /* If peeled iterations are known but number of scalar loop
4698            iterations are unknown, count a taken branch per peeled loop.  */
4699         prologue_need_br_taken_cost = true;
4700     }
4701
4702   bool epilogue_need_br_taken_cost = false;
4703   bool epilogue_need_br_not_taken_cost = false;
4704
4705   /* Calculate peel_iters_epilogue.  */
4706   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4707     /* We need to peel exactly one iteration for gaps.  */
4708     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4709   else if (npeel < 0)
4710     {
4711       /* If peeling for alignment is unknown, loop bound of main loop
4712          becomes unknown.  */
4713       peel_iters_epilogue = assumed_vf / 2;
4714       if (dump_enabled_p ())
4715         dump_printf (MSG_NOTE, "cost model: "
4716                      "epilogue peel iters set to vf/2 because "
4717                      "peeling for alignment is unknown.\n");
4718
4719       /* See the same reason above in peel_iters_prologue calculation.  */
4720       epilogue_need_br_taken_cost = true;
4721       epilogue_need_br_not_taken_cost = true;
4722     }
4723   else
4724     {
4725       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4726       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4727         /* If peeled iterations are known but number of scalar loop
4728            iterations are unknown, count a taken branch per peeled loop.  */
4729         epilogue_need_br_taken_cost = true;
4730     }
4731
4732   stmt_info_for_cost *si;
4733   int j;
4734   /* Add costs associated with peel_iters_prologue.  */
4735   if (peel_iters_prologue)
4736     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4737       {
4738         (void) add_stmt_cost (target_cost_data,
4739                               si->count * peel_iters_prologue, si->kind,
4740                               si->stmt_info, si->node, si->vectype,
4741                               si->misalign, vect_prologue);
4742       }
4743
4744   /* Add costs associated with peel_iters_epilogue.  */
4745   if (peel_iters_epilogue)
4746     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4747       {
4748         (void) add_stmt_cost (target_cost_data,
4749                               si->count * peel_iters_epilogue, si->kind,
4750                               si->stmt_info, si->node, si->vectype,
4751                               si->misalign, vect_epilogue);
4752       }
4753
4754   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4755
4756   if (prologue_need_br_taken_cost)
4757     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4758                           vect_prologue);
4759
4760   if (prologue_need_br_not_taken_cost)
4761     (void) add_stmt_cost (target_cost_data, 1,
4762                           cond_branch_not_taken, vect_prologue);
4763
4764   if (epilogue_need_br_taken_cost)
4765     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4766                           vect_epilogue);
4767
4768   if (epilogue_need_br_not_taken_cost)
4769     (void) add_stmt_cost (target_cost_data, 1,
4770                           cond_branch_not_taken, vect_epilogue);
4771
4772   /* Take care of special costs for rgroup controls of partial vectors.  */
4773   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4774       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4775           == vect_partial_vectors_avx512))
4776     {
4777       /* Calculate how many masks we need to generate.  */
4778       unsigned int num_masks = 0;
4779       bool need_saturation = false;
4780       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4781         if (rgm.type)
4782           {
4783             unsigned nvectors = rgm.factor;
4784             num_masks += nvectors;
4785             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4786                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4787               need_saturation = true;
4788           }
4789
4790       /* ???  The target isn't able to identify the costs below as
4791          producing masks so it cannot penaltize cases where we'd run
4792          out of mask registers for example.  */
4793
4794       /* ???  We are also failing to account for smaller vector masks
4795          we generate by splitting larger masks in vect_get_loop_mask.  */
4796
4797       /* In the worst case, we need to generate each mask in the prologue
4798          and in the loop body.  We need one splat per group and one
4799          compare per mask.
4800
4801          Sometimes the prologue mask will fold to a constant,
4802          so the actual prologue cost might be smaller.  However, it's
4803          simpler and safer to use the worst-case cost; if this ends up
4804          being the tie-breaker between vectorizing or not, then it's
4805          probably better not to vectorize.  */
4806       (void) add_stmt_cost (target_cost_data,
4807                             num_masks
4808                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4809                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4810                             vect_prologue);
4811       (void) add_stmt_cost (target_cost_data,
4812                             num_masks
4813                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4814                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4815
4816       /* When we need saturation we need it both in the prologue and
4817          the epilogue.  */
4818       if (need_saturation)
4819         {
4820           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4821                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4822           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4823                                 NULL, NULL, NULL_TREE, 0, vect_body);
4824         }
4825     }
4826   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4827            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4828                == vect_partial_vectors_while_ult))
4829     {
4830       /* Calculate how many masks we need to generate.  */
4831       unsigned int num_masks = 0;
4832       rgroup_controls *rgm;
4833       unsigned int num_vectors_m1;
4834       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4835                         num_vectors_m1, rgm)
4836         if (rgm->type)
4837           num_masks += num_vectors_m1 + 1;
4838       gcc_assert (num_masks > 0);
4839
4840       /* In the worst case, we need to generate each mask in the prologue
4841          and in the loop body.  One of the loop body mask instructions
4842          replaces the comparison in the scalar loop, and since we don't
4843          count the scalar comparison against the scalar body, we shouldn't
4844          count that vector instruction against the vector body either.
4845
4846          Sometimes we can use unpacks instead of generating prologue
4847          masks and sometimes the prologue mask will fold to a constant,
4848          so the actual prologue cost might be smaller.  However, it's
4849          simpler and safer to use the worst-case cost; if this ends up
4850          being the tie-breaker between vectorizing or not, then it's
4851          probably better not to vectorize.  */
4852       (void) add_stmt_cost (target_cost_data, num_masks,
4853                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4854                             vect_prologue);
4855       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4856                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4857                             vect_body);
4858     }
4859   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4860     {
4861       /* Referring to the functions vect_set_loop_condition_partial_vectors
4862          and vect_set_loop_controls_directly, we need to generate each
4863          length in the prologue and in the loop body if required. Although
4864          there are some possible optimizations, we consider the worst case
4865          here.  */
4866
4867       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4868       signed char partial_load_store_bias
4869         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4870       bool need_iterate_p
4871         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4872            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4873
4874       /* Calculate how many statements to be added.  */
4875       unsigned int prologue_stmts = 0;
4876       unsigned int body_stmts = 0;
4877
4878       rgroup_controls *rgc;
4879       unsigned int num_vectors_m1;
4880       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4881         if (rgc->type)
4882           {
4883             /* May need one SHIFT for nitems_total computation.  */
4884             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4885             if (nitems != 1 && !niters_known_p)
4886               prologue_stmts += 1;
4887
4888             /* May need one MAX and one MINUS for wrap around.  */
4889             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4890               prologue_stmts += 2;
4891
4892             /* Need one MAX and one MINUS for each batch limit excepting for
4893                the 1st one.  */
4894             prologue_stmts += num_vectors_m1 * 2;
4895
4896             unsigned int num_vectors = num_vectors_m1 + 1;
4897
4898             /* Need to set up lengths in prologue, only one MIN required
4899                for each since start index is zero.  */
4900             prologue_stmts += num_vectors;
4901
4902             /* If we have a non-zero partial load bias, we need one PLUS
4903                to adjust the load length.  */
4904             if (partial_load_store_bias != 0)
4905               body_stmts += 1;
4906
4907             unsigned int length_update_cost = 0;
4908             if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4909               /* For decrement IV style, Each only need a single SELECT_VL
4910                  or MIN since beginning to calculate the number of elements
4911                  need to be processed in current iteration.  */
4912               length_update_cost = 1;
4913             else
4914               /* For increment IV stype, Each may need two MINs and one MINUS to
4915                  update lengths in body for next iteration.  */
4916               length_update_cost = 3;
4917
4918             if (need_iterate_p)
4919               body_stmts += length_update_cost * num_vectors;
4920           }
4921
4922       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4923                             scalar_stmt, vect_prologue);
4924       (void) add_stmt_cost (target_cost_data, body_stmts,
4925                             scalar_stmt, vect_body);
4926     }
4927
4928   /* FORNOW: The scalar outside cost is incremented in one of the
4929      following ways:
4930
4931      1. The vectorizer checks for alignment and aliasing and generates
4932      a condition that allows dynamic vectorization.  A cost model
4933      check is ANDED with the versioning condition.  Hence scalar code
4934      path now has the added cost of the versioning check.
4935
4936        if (cost > th & versioning_check)
4937          jmp to vector code
4938
4939      Hence run-time scalar is incremented by not-taken branch cost.
4940
4941      2. The vectorizer then checks if a prologue is required.  If the
4942      cost model check was not done before during versioning, it has to
4943      be done before the prologue check.
4944
4945        if (cost <= th)
4946          prologue = scalar_iters
4947        if (prologue == 0)
4948          jmp to vector code
4949        else
4950          execute prologue
4951        if (prologue == num_iters)
4952          go to exit
4953
4954      Hence the run-time scalar cost is incremented by a taken branch,
4955      plus a not-taken branch, plus a taken branch cost.
4956
4957      3. The vectorizer then checks if an epilogue is required.  If the
4958      cost model check was not done before during prologue check, it
4959      has to be done with the epilogue check.
4960
4961        if (prologue == 0)
4962          jmp to vector code
4963        else
4964          execute prologue
4965        if (prologue == num_iters)
4966          go to exit
4967        vector code:
4968          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4969            jmp to epilogue
4970
4971      Hence the run-time scalar cost should be incremented by 2 taken
4972      branches.
4973
4974      TODO: The back end may reorder the BBS's differently and reverse
4975      conditions/branch directions.  Change the estimates below to
4976      something more reasonable.  */
4977
4978   /* If the number of iterations is known and we do not do versioning, we can
4979      decide whether to vectorize at compile time.  Hence the scalar version
4980      do not carry cost model guard costs.  */
4981   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4982       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4983     {
4984       /* Cost model check occurs at versioning.  */
4985       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4986         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4987       else
4988         {
4989           /* Cost model check occurs at prologue generation.  */
4990           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4991             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4992               + vect_get_stmt_cost (cond_branch_not_taken);
4993           /* Cost model check occurs at epilogue generation.  */
4994           else
4995             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4996         }
4997     }
4998
4999   /* Complete the target-specific cost calculations.  */
5000   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5001                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5002                suggested_unroll_factor);
5003
5004   if (suggested_unroll_factor && *suggested_unroll_factor > 1
5005       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5006       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5007                     *suggested_unroll_factor,
5008                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5009     {
5010       if (dump_enabled_p ())
5011         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5012                          "can't unroll as unrolled vectorization factor larger"
5013                          " than maximum vectorization factor: "
5014                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5015                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5016       *suggested_unroll_factor = 1;
5017     }
5018
5019   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5020
5021   if (dump_enabled_p ())
5022     {
5023       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5024       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
5025                    vec_inside_cost);
5026       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
5027                    vec_prologue_cost);
5028       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
5029                    vec_epilogue_cost);
5030       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
5031                    scalar_single_iter_cost);
5032       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
5033                    scalar_outside_cost);
5034       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
5035                    vec_outside_cost);
5036       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
5037                    peel_iters_prologue);
5038       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
5039                    peel_iters_epilogue);
5040     }
5041
5042   /* Calculate number of iterations required to make the vector version
5043      profitable, relative to the loop bodies only.  The following condition
5044      must hold true:
5045      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5046      where
5047      SIC = scalar iteration cost, VIC = vector iteration cost,
5048      VOC = vector outside cost, VF = vectorization factor,
5049      NPEEL = prologue iterations + epilogue iterations,
5050      SOC = scalar outside cost for run time cost model check.  */
5051
5052   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5053                           - vec_inside_cost);
5054   if (saving_per_viter <= 0)
5055     {
5056       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5057         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5058                     "vectorization did not happen for a simd loop");
5059
5060       if (dump_enabled_p ())
5061         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5062                          "cost model: the vector iteration cost = %d "
5063                          "divided by the scalar iteration cost = %d "
5064                          "is greater or equal to the vectorization factor = %d"
5065                          ".\n",
5066                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5067       *ret_min_profitable_niters = -1;
5068       *ret_min_profitable_estimate = -1;
5069       return;
5070     }
5071
5072   /* ??? The "if" arm is written to handle all cases; see below for what
5073      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5074   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5075     {
5076       /* Rewriting the condition above in terms of the number of
5077          vector iterations (vniters) rather than the number of
5078          scalar iterations (niters) gives:
5079
5080          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5081
5082          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5083
5084          For integer N, X and Y when X > 0:
5085
5086          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
5087       int outside_overhead = (vec_outside_cost
5088                               - scalar_single_iter_cost * peel_iters_prologue
5089                               - scalar_single_iter_cost * peel_iters_epilogue
5090                               - scalar_outside_cost);
5091       /* We're only interested in cases that require at least one
5092          vector iteration.  */
5093       int min_vec_niters = 1;
5094       if (outside_overhead > 0)
5095         min_vec_niters = outside_overhead / saving_per_viter + 1;
5096
5097       if (dump_enabled_p ())
5098         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
5099                      min_vec_niters);
5100
5101       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5102         {
5103           /* Now that we know the minimum number of vector iterations,
5104              find the minimum niters for which the scalar cost is larger:
5105
5106              SIC * niters > VIC * vniters + VOC - SOC
5107
5108              We know that the minimum niters is no more than
5109              vniters * VF + NPEEL, but it might be (and often is) less
5110              than that if a partial vector iteration is cheaper than the
5111              equivalent scalar code.  */
5112           int threshold = (vec_inside_cost * min_vec_niters
5113                            + vec_outside_cost
5114                            - scalar_outside_cost);
5115           if (threshold <= 0)
5116             min_profitable_iters = 1;
5117           else
5118             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5119         }
5120       else
5121         /* Convert the number of vector iterations into a number of
5122            scalar iterations.  */
5123         min_profitable_iters = (min_vec_niters * assumed_vf
5124                                 + peel_iters_prologue
5125                                 + peel_iters_epilogue);
5126     }
5127   else
5128     {
5129       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5130                               * assumed_vf
5131                               - vec_inside_cost * peel_iters_prologue
5132                               - vec_inside_cost * peel_iters_epilogue);
5133       if (min_profitable_iters <= 0)
5134         min_profitable_iters = 0;
5135       else
5136         {
5137           min_profitable_iters /= saving_per_viter;
5138
5139           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5140               <= (((int) vec_inside_cost * min_profitable_iters)
5141                   + (((int) vec_outside_cost - scalar_outside_cost)
5142                      * assumed_vf)))
5143             min_profitable_iters++;
5144         }
5145     }
5146
5147   if (dump_enabled_p ())
5148     dump_printf (MSG_NOTE,
5149                  "  Calculated minimum iters for profitability: %d\n",
5150                  min_profitable_iters);
5151
5152   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5153       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5154     /* We want the vectorized loop to execute at least once.  */
5155     min_profitable_iters = assumed_vf + peel_iters_prologue;
5156   else if (min_profitable_iters < peel_iters_prologue)
5157     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5158        vectorized loop executes at least once.  */
5159     min_profitable_iters = peel_iters_prologue;
5160
5161   if (dump_enabled_p ())
5162     dump_printf_loc (MSG_NOTE, vect_location,
5163                      "  Runtime profitability threshold = %d\n",
5164                      min_profitable_iters);
5165
5166   *ret_min_profitable_niters = min_profitable_iters;
5167
5168   /* Calculate number of iterations required to make the vector version
5169      profitable, relative to the loop bodies only.
5170
5171      Non-vectorized variant is SIC * niters and it must win over vector
5172      variant on the expected loop trip count.  The following condition must hold true:
5173      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
5174
5175   if (vec_outside_cost <= 0)
5176     min_profitable_estimate = 0;
5177   /* ??? This "else if" arm is written to handle all cases; see below for
5178      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5179   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5180     {
5181       /* This is a repeat of the code above, but with + SOC rather
5182          than - SOC.  */
5183       int outside_overhead = (vec_outside_cost
5184                               - scalar_single_iter_cost * peel_iters_prologue
5185                               - scalar_single_iter_cost * peel_iters_epilogue
5186                               + scalar_outside_cost);
5187       int min_vec_niters = 1;
5188       if (outside_overhead > 0)
5189         min_vec_niters = outside_overhead / saving_per_viter + 1;
5190
5191       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5192         {
5193           int threshold = (vec_inside_cost * min_vec_niters
5194                            + vec_outside_cost
5195                            + scalar_outside_cost);
5196           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5197         }
5198       else
5199         min_profitable_estimate = (min_vec_niters * assumed_vf
5200                                    + peel_iters_prologue
5201                                    + peel_iters_epilogue);
5202     }
5203   else
5204     {
5205       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5206                                  * assumed_vf
5207                                  - vec_inside_cost * peel_iters_prologue
5208                                  - vec_inside_cost * peel_iters_epilogue)
5209                                  / ((scalar_single_iter_cost * assumed_vf)
5210                                    - vec_inside_cost);
5211     }
5212   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5213   if (dump_enabled_p ())
5214     dump_printf_loc (MSG_NOTE, vect_location,
5215                      "  Static estimate profitability threshold = %d\n",
5216                      min_profitable_estimate);
5217
5218   *ret_min_profitable_estimate = min_profitable_estimate;
5219 }
5220
5221 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5222    vector elements (not bits) for a vector with NELT elements.  */
5223 static void
5224 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5225                               vec_perm_builder *sel)
5226 {
5227   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5228      by vec_perm_indices.  */
5229   sel->new_vector (nelt, 1, 3);
5230   for (unsigned int i = 0; i < 3; i++)
5231     sel->quick_push (i + offset);
5232 }
5233
5234 /* Checks whether the target supports whole-vector shifts for vectors of mode
5235    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5236    it supports vec_perm_const with masks for all necessary shift amounts.  */
5237 static bool
5238 have_whole_vector_shift (machine_mode mode)
5239 {
5240   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5241     return true;
5242
5243   /* Variable-length vectors should be handled via the optab.  */
5244   unsigned int nelt;
5245   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5246     return false;
5247
5248   vec_perm_builder sel;
5249   vec_perm_indices indices;
5250   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5251     {
5252       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5253       indices.new_vector (sel, 2, nelt);
5254       if (!can_vec_perm_const_p (mode, mode, indices, false))
5255         return false;
5256     }
5257   return true;
5258 }
5259
5260 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5261    multiplication operands have differing signs and (b) we intend
5262    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5263    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5264
5265 static bool
5266 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5267                                  stmt_vec_info stmt_info)
5268 {
5269   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5270   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5271     return false;
5272
5273   tree rhs1 = gimple_assign_rhs1 (assign);
5274   tree rhs2 = gimple_assign_rhs2 (assign);
5275   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5276     return false;
5277
5278   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5279   gcc_assert (reduc_info->is_reduc_info);
5280   return !directly_supported_p (DOT_PROD_EXPR,
5281                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5282                                 optab_vector_mixed_sign);
5283 }
5284
5285 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5286    functions. Design better to avoid maintenance issues.  */
5287
5288 /* Function vect_model_reduction_cost.
5289
5290    Models cost for a reduction operation, including the vector ops
5291    generated within the strip-mine loop in some cases, the initial
5292    definition before the loop, and the epilogue code that must be generated.  */
5293
5294 static void
5295 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5296                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5297                            vect_reduction_type reduction_type,
5298                            int ncopies, stmt_vector_for_cost *cost_vec)
5299 {
5300   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5301   tree vectype;
5302   machine_mode mode;
5303   class loop *loop = NULL;
5304
5305   if (loop_vinfo)
5306     loop = LOOP_VINFO_LOOP (loop_vinfo);
5307
5308   /* Condition reductions generate two reductions in the loop.  */
5309   if (reduction_type == COND_REDUCTION)
5310     ncopies *= 2;
5311
5312   vectype = STMT_VINFO_VECTYPE (stmt_info);
5313   mode = TYPE_MODE (vectype);
5314   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5315
5316   gimple_match_op op;
5317   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5318     gcc_unreachable ();
5319
5320   bool emulated_mixed_dot_prod
5321     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5322   if (reduction_type == EXTRACT_LAST_REDUCTION)
5323     /* No extra instructions are needed in the prologue.  The loop body
5324        operations are costed in vectorizable_condition.  */
5325     inside_cost = 0;
5326   else if (reduction_type == FOLD_LEFT_REDUCTION)
5327     {
5328       /* No extra instructions needed in the prologue.  */
5329       prologue_cost = 0;
5330
5331       if (reduc_fn != IFN_LAST)
5332         /* Count one reduction-like operation per vector.  */
5333         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5334                                         stmt_info, 0, vect_body);
5335       else
5336         {
5337           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5338           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5339           inside_cost = record_stmt_cost (cost_vec, nelements,
5340                                           vec_to_scalar, stmt_info, 0,
5341                                           vect_body);
5342           inside_cost += record_stmt_cost (cost_vec, nelements,
5343                                            scalar_stmt, stmt_info, 0,
5344                                            vect_body);
5345         }
5346     }
5347   else
5348     {
5349       /* Add in the cost of the initial definitions.  */
5350       int prologue_stmts;
5351       if (reduction_type == COND_REDUCTION)
5352         /* For cond reductions we have four vectors: initial index, step,
5353            initial result of the data reduction, initial value of the index
5354            reduction.  */
5355         prologue_stmts = 4;
5356       else if (emulated_mixed_dot_prod)
5357         /* We need the initial reduction value and two invariants:
5358            one that contains the minimum signed value and one that
5359            contains half of its negative.  */
5360         prologue_stmts = 3;
5361       else
5362         prologue_stmts = 1;
5363       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5364                                          scalar_to_vec, stmt_info, 0,
5365                                          vect_prologue);
5366     }
5367
5368   /* Determine cost of epilogue code.
5369
5370      We have a reduction operator that will reduce the vector in one statement.
5371      Also requires scalar extract.  */
5372
5373   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5374     {
5375       if (reduc_fn != IFN_LAST)
5376         {
5377           if (reduction_type == COND_REDUCTION)
5378             {
5379               /* An EQ stmt and an COND_EXPR stmt.  */
5380               epilogue_cost += record_stmt_cost (cost_vec, 2,
5381                                                  vector_stmt, stmt_info, 0,
5382                                                  vect_epilogue);
5383               /* Reduction of the max index and a reduction of the found
5384                  values.  */
5385               epilogue_cost += record_stmt_cost (cost_vec, 2,
5386                                                  vec_to_scalar, stmt_info, 0,
5387                                                  vect_epilogue);
5388               /* A broadcast of the max value.  */
5389               epilogue_cost += record_stmt_cost (cost_vec, 1,
5390                                                  scalar_to_vec, stmt_info, 0,
5391                                                  vect_epilogue);
5392             }
5393           else
5394             {
5395               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5396                                                  stmt_info, 0, vect_epilogue);
5397               epilogue_cost += record_stmt_cost (cost_vec, 1,
5398                                                  vec_to_scalar, stmt_info, 0,
5399                                                  vect_epilogue);
5400             }
5401         }
5402       else if (reduction_type == COND_REDUCTION)
5403         {
5404           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5405           /* Extraction of scalar elements.  */
5406           epilogue_cost += record_stmt_cost (cost_vec,
5407                                              2 * estimated_nunits,
5408                                              vec_to_scalar, stmt_info, 0,
5409                                              vect_epilogue);
5410           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5411           epilogue_cost += record_stmt_cost (cost_vec,
5412                                              2 * estimated_nunits - 3,
5413                                              scalar_stmt, stmt_info, 0,
5414                                              vect_epilogue);
5415         }
5416       else if (reduction_type == EXTRACT_LAST_REDUCTION
5417                || reduction_type == FOLD_LEFT_REDUCTION)
5418         /* No extra instructions need in the epilogue.  */
5419         ;
5420       else
5421         {
5422           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5423           tree bitsize = TYPE_SIZE (op.type);
5424           int element_bitsize = tree_to_uhwi (bitsize);
5425           int nelements = vec_size_in_bits / element_bitsize;
5426
5427           if (op.code == COND_EXPR)
5428             op.code = MAX_EXPR;
5429
5430           /* We have a whole vector shift available.  */
5431           if (VECTOR_MODE_P (mode)
5432               && directly_supported_p (op.code, vectype)
5433               && have_whole_vector_shift (mode))
5434             {
5435               /* Final reduction via vector shifts and the reduction operator.
5436                  Also requires scalar extract.  */
5437               epilogue_cost += record_stmt_cost (cost_vec,
5438                                                  exact_log2 (nelements) * 2,
5439                                                  vector_stmt, stmt_info, 0,
5440                                                  vect_epilogue);
5441               epilogue_cost += record_stmt_cost (cost_vec, 1,
5442                                                  vec_to_scalar, stmt_info, 0,
5443                                                  vect_epilogue);
5444             }
5445           else
5446             /* Use extracts and reduction op for final reduction.  For N
5447                elements, we have N extracts and N-1 reduction ops.  */
5448             epilogue_cost += record_stmt_cost (cost_vec,
5449                                                nelements + nelements - 1,
5450                                                vector_stmt, stmt_info, 0,
5451                                                vect_epilogue);
5452         }
5453     }
5454
5455   if (dump_enabled_p ())
5456     dump_printf (MSG_NOTE,
5457                  "vect_model_reduction_cost: inside_cost = %d, "
5458                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5459                  prologue_cost, epilogue_cost);
5460 }
5461
5462 /* SEQ is a sequence of instructions that initialize the reduction
5463    described by REDUC_INFO.  Emit them in the appropriate place.  */
5464
5465 static void
5466 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5467                                 stmt_vec_info reduc_info, gimple *seq)
5468 {
5469   if (reduc_info->reused_accumulator)
5470     {
5471       /* When reusing an accumulator from the main loop, we only need
5472          initialization instructions if the main loop can be skipped.
5473          In that case, emit the initialization instructions at the end
5474          of the guard block that does the skip.  */
5475       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5476       gcc_assert (skip_edge);
5477       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5478       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5479     }
5480   else
5481     {
5482       /* The normal case: emit the initialization instructions on the
5483          preheader edge.  */
5484       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5485       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5486     }
5487 }
5488
5489 /* Function get_initial_def_for_reduction
5490
5491    Input:
5492    REDUC_INFO - the info_for_reduction
5493    INIT_VAL - the initial value of the reduction variable
5494    NEUTRAL_OP - a value that has no effect on the reduction, as per
5495                 neutral_op_for_reduction
5496
5497    Output:
5498    Return a vector variable, initialized according to the operation that
5499         STMT_VINFO performs. This vector will be used as the initial value
5500         of the vector of partial results.
5501
5502    The value we need is a vector in which element 0 has value INIT_VAL
5503    and every other element has value NEUTRAL_OP.  */
5504
5505 static tree
5506 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5507                                stmt_vec_info reduc_info,
5508                                tree init_val, tree neutral_op)
5509 {
5510   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5511   tree scalar_type = TREE_TYPE (init_val);
5512   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5513   tree init_def;
5514   gimple_seq stmts = NULL;
5515
5516   gcc_assert (vectype);
5517
5518   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5519               || SCALAR_FLOAT_TYPE_P (scalar_type));
5520
5521   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5522               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5523
5524   if (operand_equal_p (init_val, neutral_op))
5525     {
5526       /* If both elements are equal then the vector described above is
5527          just a splat.  */
5528       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5529       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5530     }
5531   else
5532     {
5533       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5534       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5535       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5536         {
5537           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5538              element 0.  */
5539           init_def = gimple_build_vector_from_val (&stmts, vectype,
5540                                                    neutral_op);
5541           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5542                                    vectype, init_def, init_val);
5543         }
5544       else
5545         {
5546           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5547           tree_vector_builder elts (vectype, 1, 2);
5548           elts.quick_push (init_val);
5549           elts.quick_push (neutral_op);
5550           init_def = gimple_build_vector (&stmts, &elts);
5551         }
5552     }
5553
5554   if (stmts)
5555     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5556   return init_def;
5557 }
5558
5559 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5560    which performs a reduction involving GROUP_SIZE scalar statements.
5561    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5562    is nonnull, introducing extra elements of that value will not change the
5563    result.  */
5564
5565 static void
5566 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5567                                 stmt_vec_info reduc_info,
5568                                 vec<tree> *vec_oprnds,
5569                                 unsigned int number_of_vectors,
5570                                 unsigned int group_size, tree neutral_op)
5571 {
5572   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5573   unsigned HOST_WIDE_INT nunits;
5574   unsigned j, number_of_places_left_in_vector;
5575   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5576   unsigned int i;
5577
5578   gcc_assert (group_size == initial_values.length () || neutral_op);
5579
5580   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5581      created vectors. It is greater than 1 if unrolling is performed.
5582
5583      For example, we have two scalar operands, s1 and s2 (e.g., group of
5584      strided accesses of size two), while NUNITS is four (i.e., four scalars
5585      of this type can be packed in a vector).  The output vector will contain
5586      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5587      will be 2).
5588
5589      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5590      vectors containing the operands.
5591
5592      For example, NUNITS is four as before, and the group size is 8
5593      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5594      {s5, s6, s7, s8}.  */
5595
5596   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5597     nunits = group_size;
5598
5599   number_of_places_left_in_vector = nunits;
5600   bool constant_p = true;
5601   tree_vector_builder elts (vector_type, nunits, 1);
5602   elts.quick_grow (nunits);
5603   gimple_seq ctor_seq = NULL;
5604   for (j = 0; j < nunits * number_of_vectors; ++j)
5605     {
5606       tree op;
5607       i = j % group_size;
5608
5609       /* Get the def before the loop.  In reduction chain we have only
5610          one initial value.  Else we have as many as PHIs in the group.  */
5611       if (i >= initial_values.length () || (j > i && neutral_op))
5612         op = neutral_op;
5613       else
5614         op = initial_values[i];
5615
5616       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5617       number_of_places_left_in_vector--;
5618       elts[nunits - number_of_places_left_in_vector - 1] = op;
5619       if (!CONSTANT_CLASS_P (op))
5620         constant_p = false;
5621
5622       if (number_of_places_left_in_vector == 0)
5623         {
5624           tree init;
5625           if (constant_p && !neutral_op
5626               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5627               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5628             /* Build the vector directly from ELTS.  */
5629             init = gimple_build_vector (&ctor_seq, &elts);
5630           else if (neutral_op)
5631             {
5632               /* Build a vector of the neutral value and shift the
5633                  other elements into place.  */
5634               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5635                                                    neutral_op);
5636               int k = nunits;
5637               while (k > 0 && elts[k - 1] == neutral_op)
5638                 k -= 1;
5639               while (k > 0)
5640                 {
5641                   k -= 1;
5642                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5643                                        vector_type, init, elts[k]);
5644                 }
5645             }
5646           else
5647             {
5648               /* First time round, duplicate ELTS to fill the
5649                  required number of vectors.  */
5650               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5651                                         elts, number_of_vectors, *vec_oprnds);
5652               break;
5653             }
5654           vec_oprnds->quick_push (init);
5655
5656           number_of_places_left_in_vector = nunits;
5657           elts.new_vector (vector_type, nunits, 1);
5658           elts.quick_grow (nunits);
5659           constant_p = true;
5660         }
5661     }
5662   if (ctor_seq != NULL)
5663     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5664 }
5665
5666 /* For a statement STMT_INFO taking part in a reduction operation return
5667    the stmt_vec_info the meta information is stored on.  */
5668
5669 stmt_vec_info
5670 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5671 {
5672   stmt_info = vect_orig_stmt (stmt_info);
5673   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5674   if (!is_a <gphi *> (stmt_info->stmt)
5675       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5676     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5677   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5678   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5679     {
5680       if (gimple_phi_num_args (phi) == 1)
5681         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5682     }
5683   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5684     {
5685       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5686       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5687         stmt_info = info;
5688     }
5689   return stmt_info;
5690 }
5691
5692 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5693    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5694    return false.  */
5695
5696 static bool
5697 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5698                                 stmt_vec_info reduc_info)
5699 {
5700   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5701   if (!main_loop_vinfo)
5702     return false;
5703
5704   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5705     return false;
5706
5707   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5708   auto_vec<tree, 16> main_loop_results (num_phis);
5709   auto_vec<tree, 16> initial_values (num_phis);
5710   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5711     {
5712       /* The epilogue loop can be entered either from the main loop or
5713          from an earlier guard block.  */
5714       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5715       for (tree incoming_value : reduc_info->reduc_initial_values)
5716         {
5717           /* Look for:
5718
5719                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5720                                     INITIAL_VALUE(guard block)>.  */
5721           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5722
5723           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5724           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5725
5726           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5727           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5728
5729           main_loop_results.quick_push (from_main_loop);
5730           initial_values.quick_push (from_skip);
5731         }
5732     }
5733   else
5734     /* The main loop dominates the epilogue loop.  */
5735     main_loop_results.splice (reduc_info->reduc_initial_values);
5736
5737   /* See if the main loop has the kind of accumulator we need.  */
5738   vect_reusable_accumulator *accumulator
5739     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5740   if (!accumulator
5741       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5742       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5743                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5744     return false;
5745
5746   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5747   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5748   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5749   unsigned HOST_WIDE_INT m;
5750   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5751                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5752     return false;
5753   /* Check the intermediate vector types and operations are available.  */
5754   tree prev_vectype = old_vectype;
5755   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5756   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5757     {
5758       intermediate_nunits = exact_div (intermediate_nunits, 2);
5759       tree intermediate_vectype = get_related_vectype_for_scalar_type
5760         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5761       if (!intermediate_vectype
5762           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5763                                     intermediate_vectype)
5764           || !can_vec_extract (TYPE_MODE (prev_vectype),
5765                                TYPE_MODE (intermediate_vectype)))
5766         return false;
5767       prev_vectype = intermediate_vectype;
5768     }
5769
5770   /* Non-SLP reductions might apply an adjustment after the reduction
5771      operation, in order to simplify the initialization of the accumulator.
5772      If the epilogue loop carries on from where the main loop left off,
5773      it should apply the same adjustment to the final reduction result.
5774
5775      If the epilogue loop can also be entered directly (rather than via
5776      the main loop), we need to be able to handle that case in the same way,
5777      with the same adjustment.  (In principle we could add a PHI node
5778      to select the correct adjustment, but in practice that shouldn't be
5779      necessary.)  */
5780   tree main_adjustment
5781     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5782   if (loop_vinfo->main_loop_edge && main_adjustment)
5783     {
5784       gcc_assert (num_phis == 1);
5785       tree initial_value = initial_values[0];
5786       /* Check that we can use INITIAL_VALUE as the adjustment and
5787          initialize the accumulator with a neutral value instead.  */
5788       if (!operand_equal_p (initial_value, main_adjustment))
5789         return false;
5790       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5791       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5792                                                     code, initial_value);
5793     }
5794   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5795   reduc_info->reduc_initial_values.truncate (0);
5796   reduc_info->reduc_initial_values.splice (initial_values);
5797   reduc_info->reused_accumulator = accumulator;
5798   return true;
5799 }
5800
5801 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5802    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5803
5804 static tree
5805 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5806                             gimple_seq *seq)
5807 {
5808   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5809   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5810   tree stype = TREE_TYPE (vectype);
5811   tree new_temp = vec_def;
5812   while (nunits > nunits1)
5813     {
5814       nunits /= 2;
5815       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5816                                                            stype, nunits);
5817       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5818
5819       /* The target has to make sure we support lowpart/highpart
5820          extraction, either via direct vector extract or through
5821          an integer mode punning.  */
5822       tree dst1, dst2;
5823       gimple *epilog_stmt;
5824       if (convert_optab_handler (vec_extract_optab,
5825                                  TYPE_MODE (TREE_TYPE (new_temp)),
5826                                  TYPE_MODE (vectype1))
5827           != CODE_FOR_nothing)
5828         {
5829           /* Extract sub-vectors directly once vec_extract becomes
5830              a conversion optab.  */
5831           dst1 = make_ssa_name (vectype1);
5832           epilog_stmt
5833               = gimple_build_assign (dst1, BIT_FIELD_REF,
5834                                      build3 (BIT_FIELD_REF, vectype1,
5835                                              new_temp, TYPE_SIZE (vectype1),
5836                                              bitsize_int (0)));
5837           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5838           dst2 =  make_ssa_name (vectype1);
5839           epilog_stmt
5840               = gimple_build_assign (dst2, BIT_FIELD_REF,
5841                                      build3 (BIT_FIELD_REF, vectype1,
5842                                              new_temp, TYPE_SIZE (vectype1),
5843                                              bitsize_int (bitsize)));
5844           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5845         }
5846       else
5847         {
5848           /* Extract via punning to appropriately sized integer mode
5849              vector.  */
5850           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5851           tree etype = build_vector_type (eltype, 2);
5852           gcc_assert (convert_optab_handler (vec_extract_optab,
5853                                              TYPE_MODE (etype),
5854                                              TYPE_MODE (eltype))
5855                       != CODE_FOR_nothing);
5856           tree tem = make_ssa_name (etype);
5857           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5858                                              build1 (VIEW_CONVERT_EXPR,
5859                                                      etype, new_temp));
5860           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5861           new_temp = tem;
5862           tem = make_ssa_name (eltype);
5863           epilog_stmt
5864               = gimple_build_assign (tem, BIT_FIELD_REF,
5865                                      build3 (BIT_FIELD_REF, eltype,
5866                                              new_temp, TYPE_SIZE (eltype),
5867                                              bitsize_int (0)));
5868           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5869           dst1 = make_ssa_name (vectype1);
5870           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5871                                              build1 (VIEW_CONVERT_EXPR,
5872                                                      vectype1, tem));
5873           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5874           tem = make_ssa_name (eltype);
5875           epilog_stmt
5876               = gimple_build_assign (tem, BIT_FIELD_REF,
5877                                      build3 (BIT_FIELD_REF, eltype,
5878                                              new_temp, TYPE_SIZE (eltype),
5879                                              bitsize_int (bitsize)));
5880           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5881           dst2 =  make_ssa_name (vectype1);
5882           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5883                                              build1 (VIEW_CONVERT_EXPR,
5884                                                      vectype1, tem));
5885           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5886         }
5887
5888       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5889     }
5890
5891   return new_temp;
5892 }
5893
5894 /* Retrieves the definining statement to be used for a reduction.
5895    For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5896    the reduction definitions.  */
5897
5898 tree
5899 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5900                    slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5901                    vec <gimple *> &vec_stmts)
5902 {
5903   tree def;
5904
5905   if (slp_node)
5906     {
5907       if (!main_exit_p)
5908         slp_node = slp_node_instance->reduc_phis;
5909       def = vect_get_slp_vect_def (slp_node, i);
5910     }
5911   else
5912     {
5913       if (!main_exit_p)
5914         reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5915       vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5916       def = gimple_get_lhs (vec_stmts[0]);
5917     }
5918
5919   return def;
5920 }
5921
5922 /* Function vect_create_epilog_for_reduction
5923
5924    Create code at the loop-epilog to finalize the result of a reduction
5925    computation.
5926
5927    STMT_INFO is the scalar reduction stmt that is being vectorized.
5928    SLP_NODE is an SLP node containing a group of reduction statements. The
5929      first one in this group is STMT_INFO.
5930    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5931    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5932      (counting from 0)
5933    LOOP_EXIT is the edge to update in the merge block.  In the case of a single
5934      exit this edge is always the main loop exit.
5935
5936    This function:
5937    1. Completes the reduction def-use cycles.
5938    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5939       by calling the function specified by REDUC_FN if available, or by
5940       other means (whole-vector shifts or a scalar loop).
5941       The function also creates a new phi node at the loop exit to preserve
5942       loop-closed form, as illustrated below.
5943
5944      The flow at the entry to this function:
5945
5946         loop:
5947           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5948           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5949           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5950         loop_exit:
5951           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5952           use <s_out0>
5953           use <s_out0>
5954
5955      The above is transformed by this function into:
5956
5957         loop:
5958           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5959           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5960           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5961         loop_exit:
5962           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5963           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5964           v_out2 = reduce <v_out1>
5965           s_out3 = extract_field <v_out2, 0>
5966           s_out4 = adjust_result <s_out3>
5967           use <s_out4>
5968           use <s_out4>
5969 */
5970
5971 static void
5972 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5973                                   stmt_vec_info stmt_info,
5974                                   slp_tree slp_node,
5975                                   slp_instance slp_node_instance,
5976                                   edge loop_exit)
5977 {
5978   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5979   gcc_assert (reduc_info->is_reduc_info);
5980   /* For double reductions we need to get at the inner loop reduction
5981      stmt which has the meta info attached.  Our stmt_info is that of the
5982      loop-closed PHI of the inner loop which we remember as
5983      def for the reduction PHI generation.  */
5984   bool double_reduc = false;
5985   bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5986   stmt_vec_info rdef_info = stmt_info;
5987   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5988     {
5989       gcc_assert (!slp_node);
5990       double_reduc = true;
5991       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5992                                             (stmt_info->stmt, 0));
5993       stmt_info = vect_stmt_to_vectorize (stmt_info);
5994     }
5995   gphi *reduc_def_stmt
5996     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5997   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5998   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5999   tree vectype;
6000   machine_mode mode;
6001   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6002   basic_block exit_bb;
6003   tree scalar_dest;
6004   tree scalar_type;
6005   gimple *new_phi = NULL, *phi = NULL;
6006   gimple_stmt_iterator exit_gsi;
6007   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6008   gimple *epilog_stmt = NULL;
6009   gimple *exit_phi;
6010   tree bitsize;
6011   tree def;
6012   tree orig_name, scalar_result;
6013   imm_use_iterator imm_iter, phi_imm_iter;
6014   use_operand_p use_p, phi_use_p;
6015   gimple *use_stmt;
6016   auto_vec<tree> reduc_inputs;
6017   int j, i;
6018   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6019   unsigned int group_size = 1, k;
6020   auto_vec<gimple *> phis;
6021   /* SLP reduction without reduction chain, e.g.,
6022      # a1 = phi <a2, a0>
6023      # b1 = phi <b2, b0>
6024      a2 = operation (a1)
6025      b2 = operation (b1)  */
6026   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6027   bool direct_slp_reduc;
6028   tree induction_index = NULL_TREE;
6029
6030   if (slp_node)
6031     group_size = SLP_TREE_LANES (slp_node);
6032
6033   if (nested_in_vect_loop_p (loop, stmt_info))
6034     {
6035       outer_loop = loop;
6036       loop = loop->inner;
6037       gcc_assert (!slp_node && double_reduc);
6038     }
6039
6040   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6041   gcc_assert (vectype);
6042   mode = TYPE_MODE (vectype);
6043
6044   tree induc_val = NULL_TREE;
6045   tree adjustment_def = NULL;
6046   if (slp_node)
6047     ;
6048   else
6049     {
6050       /* Optimize: for induction condition reduction, if we can't use zero
6051          for induc_val, use initial_def.  */
6052       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6053         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6054       else if (double_reduc)
6055         ;
6056       else
6057         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6058     }
6059
6060   stmt_vec_info single_live_out_stmt[] = { stmt_info };
6061   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6062   if (slp_reduc)
6063     /* All statements produce live-out values.  */
6064     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6065   else if (slp_node)
6066     {
6067       /* The last statement in the reduction chain produces the live-out
6068          value.  Note SLP optimization can shuffle scalar stmts to
6069          optimize permutations so we have to search for the last stmt.  */
6070       for (k = 0; k < group_size; ++k)
6071         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6072           {
6073             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6074             break;
6075           }
6076     }
6077
6078   unsigned vec_num;
6079   int ncopies;
6080   if (slp_node)
6081     {
6082       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6083       ncopies = 1;
6084     }
6085   else
6086     {
6087       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6088       vec_num = 1;
6089       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6090     }
6091
6092   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6093      which is updated with the current index of the loop for every match of
6094      the original loop's cond_expr (VEC_STMT).  This results in a vector
6095      containing the last time the condition passed for that vector lane.
6096      The first match will be a 1 to allow 0 to be used for non-matching
6097      indexes.  If there are no matches at all then the vector will be all
6098      zeroes.
6099
6100      PR92772: This algorithm is broken for architectures that support
6101      masked vectors, but do not provide fold_extract_last.  */
6102   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6103     {
6104       auto_vec<std::pair<tree, bool>, 2> ccompares;
6105       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6106       cond_info = vect_stmt_to_vectorize (cond_info);
6107       while (cond_info != reduc_info)
6108         {
6109           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6110             {
6111               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6112               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6113               ccompares.safe_push
6114                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6115                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
6116             }
6117           cond_info
6118             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6119                                                  1 + STMT_VINFO_REDUC_IDX
6120                                                         (cond_info)));
6121           cond_info = vect_stmt_to_vectorize (cond_info);
6122         }
6123       gcc_assert (ccompares.length () != 0);
6124
6125       tree indx_before_incr, indx_after_incr;
6126       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6127       int scalar_precision
6128         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6129       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6130       tree cr_index_vector_type = get_related_vectype_for_scalar_type
6131         (TYPE_MODE (vectype), cr_index_scalar_type,
6132          TYPE_VECTOR_SUBPARTS (vectype));
6133
6134       /* First we create a simple vector induction variable which starts
6135          with the values {1,2,3,...} (SERIES_VECT) and increments by the
6136          vector size (STEP).  */
6137
6138       /* Create a {1,2,3,...} vector.  */
6139       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6140
6141       /* Create a vector of the step value.  */
6142       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6143       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6144
6145       /* Create an induction variable.  */
6146       gimple_stmt_iterator incr_gsi;
6147       bool insert_after;
6148       vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6149       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6150                  insert_after, &indx_before_incr, &indx_after_incr);
6151
6152       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6153          filled with zeros (VEC_ZERO).  */
6154
6155       /* Create a vector of 0s.  */
6156       tree zero = build_zero_cst (cr_index_scalar_type);
6157       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6158
6159       /* Create a vector phi node.  */
6160       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6161       new_phi = create_phi_node (new_phi_tree, loop->header);
6162       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6163                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
6164
6165       /* Now take the condition from the loops original cond_exprs
6166          and produce a new cond_exprs (INDEX_COND_EXPR) which for
6167          every match uses values from the induction variable
6168          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6169          (NEW_PHI_TREE).
6170          Finally, we update the phi (NEW_PHI_TREE) to take the value of
6171          the new cond_expr (INDEX_COND_EXPR).  */
6172       gimple_seq stmts = NULL;
6173       for (int i = ccompares.length () - 1; i != -1; --i)
6174         {
6175           tree ccompare = ccompares[i].first;
6176           if (ccompares[i].second)
6177             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6178                                          cr_index_vector_type,
6179                                          ccompare,
6180                                          indx_before_incr, new_phi_tree);
6181           else
6182             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6183                                          cr_index_vector_type,
6184                                          ccompare,
6185                                          new_phi_tree, indx_before_incr);
6186         }
6187       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6188
6189       /* Update the phi with the vec cond.  */
6190       induction_index = new_phi_tree;
6191       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6192                    loop_latch_edge (loop), UNKNOWN_LOCATION);
6193     }
6194
6195   /* 2. Create epilog code.
6196         The reduction epilog code operates across the elements of the vector
6197         of partial results computed by the vectorized loop.
6198         The reduction epilog code consists of:
6199
6200         step 1: compute the scalar result in a vector (v_out2)
6201         step 2: extract the scalar result (s_out3) from the vector (v_out2)
6202         step 3: adjust the scalar result (s_out3) if needed.
6203
6204         Step 1 can be accomplished using one the following three schemes:
6205           (scheme 1) using reduc_fn, if available.
6206           (scheme 2) using whole-vector shifts, if available.
6207           (scheme 3) using a scalar loop. In this case steps 1+2 above are
6208                      combined.
6209
6210           The overall epilog code looks like this:
6211
6212           s_out0 = phi <s_loop>         # original EXIT_PHI
6213           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
6214           v_out2 = reduce <v_out1>              # step 1
6215           s_out3 = extract_field <v_out2, 0>    # step 2
6216           s_out4 = adjust_result <s_out3>       # step 3
6217
6218           (step 3 is optional, and steps 1 and 2 may be combined).
6219           Lastly, the uses of s_out0 are replaced by s_out4.  */
6220
6221
6222   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6223          v_out1 = phi <VECT_DEF>
6224          Store them in NEW_PHIS.  */
6225   if (double_reduc)
6226     loop = outer_loop;
6227   /* We need to reduce values in all exits.  */
6228   exit_bb = loop_exit->dest;
6229   exit_gsi = gsi_after_labels (exit_bb);
6230   reduc_inputs.create (slp_node ? vec_num : ncopies);
6231   vec <gimple *> vec_stmts = vNULL;
6232   for (unsigned i = 0; i < vec_num; i++)
6233     {
6234       gimple_seq stmts = NULL;
6235       def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6236                                main_exit_p, i, vec_stmts);
6237       for (j = 0; j < ncopies; j++)
6238         {
6239           tree new_def = copy_ssa_name (def);
6240           phi = create_phi_node (new_def, exit_bb);
6241           if (j)
6242             def = gimple_get_lhs (vec_stmts[j]);
6243           if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6244             SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6245           else
6246             {
6247               for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6248                 SET_PHI_ARG_DEF (phi, k, def);
6249             }
6250           new_def = gimple_convert (&stmts, vectype, new_def);
6251           reduc_inputs.quick_push (new_def);
6252         }
6253       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6254     }
6255
6256   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6257          (i.e. when reduc_fn is not available) and in the final adjustment
6258          code (if needed).  Also get the original scalar reduction variable as
6259          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6260          represents a reduction pattern), the tree-code and scalar-def are
6261          taken from the original stmt that the pattern-stmt (STMT) replaces.
6262          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6263          are taken from STMT.  */
6264
6265   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6266   if (orig_stmt_info != stmt_info)
6267     {
6268       /* Reduction pattern  */
6269       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6270       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6271     }
6272
6273   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6274   scalar_type = TREE_TYPE (scalar_dest);
6275   scalar_results.truncate (0);
6276   scalar_results.reserve_exact (group_size);
6277   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6278   bitsize = TYPE_SIZE (scalar_type);
6279
6280   /* True if we should implement SLP_REDUC using native reduction operations
6281      instead of scalar operations.  */
6282   direct_slp_reduc = (reduc_fn != IFN_LAST
6283                       && slp_reduc
6284                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6285
6286   /* In case of reduction chain, e.g.,
6287      # a1 = phi <a3, a0>
6288      a2 = operation (a1)
6289      a3 = operation (a2),
6290
6291      we may end up with more than one vector result.  Here we reduce them
6292      to one vector.
6293
6294      The same is true for a SLP reduction, e.g.,
6295      # a1 = phi <a2, a0>
6296      # b1 = phi <b2, b0>
6297      a2 = operation (a1)
6298      b2 = operation (a2),
6299
6300      where we can end up with more than one vector as well.  We can
6301      easily accumulate vectors when the number of vector elements is
6302      a multiple of the SLP group size.
6303
6304      The same is true if we couldn't use a single defuse cycle.  */
6305   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6306       || direct_slp_reduc
6307       || (slp_reduc
6308           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6309       || ncopies > 1)
6310     {
6311       gimple_seq stmts = NULL;
6312       tree single_input = reduc_inputs[0];
6313       for (k = 1; k < reduc_inputs.length (); k++)
6314         single_input = gimple_build (&stmts, code, vectype,
6315                                      single_input, reduc_inputs[k]);
6316       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6317
6318       reduc_inputs.truncate (0);
6319       reduc_inputs.safe_push (single_input);
6320     }
6321
6322   tree orig_reduc_input = reduc_inputs[0];
6323
6324   /* If this loop is an epilogue loop that can be skipped after the
6325      main loop, we can only share a reduction operation between the
6326      main loop and the epilogue if we put it at the target of the
6327      skip edge.
6328
6329      We can still reuse accumulators if this check fails.  Doing so has
6330      the minor(?) benefit of making the epilogue loop's scalar result
6331      independent of the main loop's scalar result.  */
6332   bool unify_with_main_loop_p = false;
6333   if (reduc_info->reused_accumulator
6334       && loop_vinfo->skip_this_loop_edge
6335       && single_succ_p (exit_bb)
6336       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6337     {
6338       unify_with_main_loop_p = true;
6339
6340       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6341       reduc_inputs[0] = make_ssa_name (vectype);
6342       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6343       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6344                    UNKNOWN_LOCATION);
6345       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6346                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6347       exit_gsi = gsi_after_labels (reduc_block);
6348     }
6349
6350   /* Shouldn't be used beyond this point.  */
6351   exit_bb = nullptr;
6352
6353   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6354       && reduc_fn != IFN_LAST)
6355     {
6356       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6357          various data values where the condition matched and another vector
6358          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6359          need to extract the last matching index (which will be the index with
6360          highest value) and use this to index into the data vector.
6361          For the case where there were no matches, the data vector will contain
6362          all default values and the index vector will be all zeros.  */
6363
6364       /* Get various versions of the type of the vector of indexes.  */
6365       tree index_vec_type = TREE_TYPE (induction_index);
6366       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6367       tree index_scalar_type = TREE_TYPE (index_vec_type);
6368       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6369
6370       /* Get an unsigned integer version of the type of the data vector.  */
6371       int scalar_precision
6372         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6373       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6374       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6375                                                 vectype);
6376
6377       /* First we need to create a vector (ZERO_VEC) of zeros and another
6378          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6379          can create using a MAX reduction and then expanding.
6380          In the case where the loop never made any matches, the max index will
6381          be zero.  */
6382
6383       /* Vector of {0, 0, 0,...}.  */
6384       tree zero_vec = build_zero_cst (vectype);
6385
6386       /* Find maximum value from the vector of found indexes.  */
6387       tree max_index = make_ssa_name (index_scalar_type);
6388       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6389                                                           1, induction_index);
6390       gimple_call_set_lhs (max_index_stmt, max_index);
6391       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6392
6393       /* Vector of {max_index, max_index, max_index,...}.  */
6394       tree max_index_vec = make_ssa_name (index_vec_type);
6395       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6396                                                       max_index);
6397       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6398                                                         max_index_vec_rhs);
6399       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6400
6401       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6402          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6403          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6404          otherwise.  Only one value should match, resulting in a vector
6405          (VEC_COND) with one data value and the rest zeros.
6406          In the case where the loop never made any matches, every index will
6407          match, resulting in a vector with all data values (which will all be
6408          the default value).  */
6409
6410       /* Compare the max index vector to the vector of found indexes to find
6411          the position of the max value.  */
6412       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6413       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6414                                                       induction_index,
6415                                                       max_index_vec);
6416       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6417
6418       /* Use the compare to choose either values from the data vector or
6419          zero.  */
6420       tree vec_cond = make_ssa_name (vectype);
6421       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6422                                                    vec_compare,
6423                                                    reduc_inputs[0],
6424                                                    zero_vec);
6425       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6426
6427       /* Finally we need to extract the data value from the vector (VEC_COND)
6428          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
6429          reduction, but because this doesn't exist, we can use a MAX reduction
6430          instead.  The data value might be signed or a float so we need to cast
6431          it first.
6432          In the case where the loop never made any matches, the data values are
6433          all identical, and so will reduce down correctly.  */
6434
6435       /* Make the matched data values unsigned.  */
6436       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6437       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6438                                        vec_cond);
6439       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6440                                                         VIEW_CONVERT_EXPR,
6441                                                         vec_cond_cast_rhs);
6442       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6443
6444       /* Reduce down to a scalar value.  */
6445       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6446       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6447                                                            1, vec_cond_cast);
6448       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6449       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6450
6451       /* Convert the reduced value back to the result type and set as the
6452          result.  */
6453       gimple_seq stmts = NULL;
6454       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6455                                data_reduc);
6456       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6457       scalar_results.safe_push (new_temp);
6458     }
6459   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6460            && reduc_fn == IFN_LAST)
6461     {
6462       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6463          idx = 0;
6464          idx_val = induction_index[0];
6465          val = data_reduc[0];
6466          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6467            if (induction_index[i] > idx_val)
6468              val = data_reduc[i], idx_val = induction_index[i];
6469          return val;  */
6470
6471       tree data_eltype = TREE_TYPE (vectype);
6472       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6473       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6474       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6475       /* Enforced by vectorizable_reduction, which ensures we have target
6476          support before allowing a conditional reduction on variable-length
6477          vectors.  */
6478       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6479       tree idx_val = NULL_TREE, val = NULL_TREE;
6480       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6481         {
6482           tree old_idx_val = idx_val;
6483           tree old_val = val;
6484           idx_val = make_ssa_name (idx_eltype);
6485           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6486                                              build3 (BIT_FIELD_REF, idx_eltype,
6487                                                      induction_index,
6488                                                      bitsize_int (el_size),
6489                                                      bitsize_int (off)));
6490           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6491           val = make_ssa_name (data_eltype);
6492           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6493                                              build3 (BIT_FIELD_REF,
6494                                                      data_eltype,
6495                                                      reduc_inputs[0],
6496                                                      bitsize_int (el_size),
6497                                                      bitsize_int (off)));
6498           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6499           if (off != 0)
6500             {
6501               tree new_idx_val = idx_val;
6502               if (off != v_size - el_size)
6503                 {
6504                   new_idx_val = make_ssa_name (idx_eltype);
6505                   epilog_stmt = gimple_build_assign (new_idx_val,
6506                                                      MAX_EXPR, idx_val,
6507                                                      old_idx_val);
6508                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6509                 }
6510               tree cond = make_ssa_name (boolean_type_node);
6511               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6512                                                  idx_val, old_idx_val);
6513               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6514               tree new_val = make_ssa_name (data_eltype);
6515               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6516                                                  cond, val, old_val);
6517               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6518               idx_val = new_idx_val;
6519               val = new_val;
6520             }
6521         }
6522       /* Convert the reduced value back to the result type and set as the
6523          result.  */
6524       gimple_seq stmts = NULL;
6525       val = gimple_convert (&stmts, scalar_type, val);
6526       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6527       scalar_results.safe_push (val);
6528     }
6529
6530   /* 2.3 Create the reduction code, using one of the three schemes described
6531          above. In SLP we simply need to extract all the elements from the
6532          vector (without reducing them), so we use scalar shifts.  */
6533   else if (reduc_fn != IFN_LAST && !slp_reduc)
6534     {
6535       tree tmp;
6536       tree vec_elem_type;
6537
6538       /* Case 1:  Create:
6539          v_out2 = reduc_expr <v_out1>  */
6540
6541       if (dump_enabled_p ())
6542         dump_printf_loc (MSG_NOTE, vect_location,
6543                          "Reduce using direct vector reduction.\n");
6544
6545       gimple_seq stmts = NULL;
6546       vec_elem_type = TREE_TYPE (vectype);
6547       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6548                                vec_elem_type, reduc_inputs[0]);
6549       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6550       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6551
6552       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6553           && induc_val)
6554         {
6555           /* Earlier we set the initial value to be a vector if induc_val
6556              values.  Check the result and if it is induc_val then replace
6557              with the original initial value, unless induc_val is
6558              the same as initial_def already.  */
6559           tree zcompare = make_ssa_name (boolean_type_node);
6560           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6561                                              new_temp, induc_val);
6562           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6563           tree initial_def = reduc_info->reduc_initial_values[0];
6564           tmp = make_ssa_name (new_scalar_dest);
6565           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6566                                              initial_def, new_temp);
6567           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6568           new_temp = tmp;
6569         }
6570
6571       scalar_results.safe_push (new_temp);
6572     }
6573   else if (direct_slp_reduc)
6574     {
6575       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6576          with the elements for other SLP statements replaced with the
6577          neutral value.  We can then do a normal reduction on each vector.  */
6578
6579       /* Enforced by vectorizable_reduction.  */
6580       gcc_assert (reduc_inputs.length () == 1);
6581       gcc_assert (pow2p_hwi (group_size));
6582
6583       gimple_seq seq = NULL;
6584
6585       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6586          and the same element size as VECTYPE.  */
6587       tree index = build_index_vector (vectype, 0, 1);
6588       tree index_type = TREE_TYPE (index);
6589       tree index_elt_type = TREE_TYPE (index_type);
6590       tree mask_type = truth_type_for (index_type);
6591
6592       /* Create a vector that, for each element, identifies which of
6593          the REDUC_GROUP_SIZE results should use it.  */
6594       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6595       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6596                             build_vector_from_val (index_type, index_mask));
6597
6598       /* Get a neutral vector value.  This is simply a splat of the neutral
6599          scalar value if we have one, otherwise the initial scalar value
6600          is itself a neutral value.  */
6601       tree vector_identity = NULL_TREE;
6602       tree neutral_op = NULL_TREE;
6603       if (slp_node)
6604         {
6605           tree initial_value = NULL_TREE;
6606           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6607             initial_value = reduc_info->reduc_initial_values[0];
6608           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6609                                                  initial_value, false);
6610         }
6611       if (neutral_op)
6612         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6613                                                         neutral_op);
6614       for (unsigned int i = 0; i < group_size; ++i)
6615         {
6616           /* If there's no univeral neutral value, we can use the
6617              initial scalar value from the original PHI.  This is used
6618              for MIN and MAX reduction, for example.  */
6619           if (!neutral_op)
6620             {
6621               tree scalar_value = reduc_info->reduc_initial_values[i];
6622               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6623                                              scalar_value);
6624               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6625                                                               scalar_value);
6626             }
6627
6628           /* Calculate the equivalent of:
6629
6630              sel[j] = (index[j] == i);
6631
6632              which selects the elements of REDUC_INPUTS[0] that should
6633              be included in the result.  */
6634           tree compare_val = build_int_cst (index_elt_type, i);
6635           compare_val = build_vector_from_val (index_type, compare_val);
6636           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6637                                    index, compare_val);
6638
6639           /* Calculate the equivalent of:
6640
6641              vec = seq ? reduc_inputs[0] : vector_identity;
6642
6643              VEC is now suitable for a full vector reduction.  */
6644           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6645                                    sel, reduc_inputs[0], vector_identity);
6646
6647           /* Do the reduction and convert it to the appropriate type.  */
6648           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6649                                       TREE_TYPE (vectype), vec);
6650           scalar = gimple_convert (&seq, scalar_type, scalar);
6651           scalar_results.safe_push (scalar);
6652         }
6653       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6654     }
6655   else
6656     {
6657       bool reduce_with_shift;
6658       tree vec_temp;
6659
6660       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6661
6662       /* See if the target wants to do the final (shift) reduction
6663          in a vector mode of smaller size and first reduce upper/lower
6664          halves against each other.  */
6665       enum machine_mode mode1 = mode;
6666       tree stype = TREE_TYPE (vectype);
6667       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6668       unsigned nunits1 = nunits;
6669       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6670           && reduc_inputs.length () == 1)
6671         {
6672           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6673           /* For SLP reductions we have to make sure lanes match up, but
6674              since we're doing individual element final reduction reducing
6675              vector width here is even more important.
6676              ???  We can also separate lanes with permutes, for the common
6677              case of power-of-two group-size odd/even extracts would work.  */
6678           if (slp_reduc && nunits != nunits1)
6679             {
6680               nunits1 = least_common_multiple (nunits1, group_size);
6681               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6682             }
6683         }
6684       if (!slp_reduc
6685           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6686         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6687
6688       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6689                                                            stype, nunits1);
6690       reduce_with_shift = have_whole_vector_shift (mode1);
6691       if (!VECTOR_MODE_P (mode1)
6692           || !directly_supported_p (code, vectype1))
6693         reduce_with_shift = false;
6694
6695       /* First reduce the vector to the desired vector size we should
6696          do shift reduction on by combining upper and lower halves.  */
6697       gimple_seq stmts = NULL;
6698       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6699                                              code, &stmts);
6700       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6701       reduc_inputs[0] = new_temp;
6702
6703       if (reduce_with_shift && !slp_reduc)
6704         {
6705           int element_bitsize = tree_to_uhwi (bitsize);
6706           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6707              for variable-length vectors and also requires direct target support
6708              for loop reductions.  */
6709           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6710           int nelements = vec_size_in_bits / element_bitsize;
6711           vec_perm_builder sel;
6712           vec_perm_indices indices;
6713
6714           int elt_offset;
6715
6716           tree zero_vec = build_zero_cst (vectype1);
6717           /* Case 2: Create:
6718              for (offset = nelements/2; offset >= 1; offset/=2)
6719                 {
6720                   Create:  va' = vec_shift <va, offset>
6721                   Create:  va = vop <va, va'>
6722                 }  */
6723
6724           tree rhs;
6725
6726           if (dump_enabled_p ())
6727             dump_printf_loc (MSG_NOTE, vect_location,
6728                              "Reduce using vector shifts\n");
6729
6730           gimple_seq stmts = NULL;
6731           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6732           for (elt_offset = nelements / 2;
6733                elt_offset >= 1;
6734                elt_offset /= 2)
6735             {
6736               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6737               indices.new_vector (sel, 2, nelements);
6738               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6739               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6740                                        new_temp, zero_vec, mask);
6741               new_temp = gimple_build (&stmts, code,
6742                                        vectype1, new_name, new_temp);
6743             }
6744           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6745
6746           /* 2.4  Extract the final scalar result.  Create:
6747              s_out3 = extract_field <v_out2, bitpos>  */
6748
6749           if (dump_enabled_p ())
6750             dump_printf_loc (MSG_NOTE, vect_location,
6751                              "extract scalar result\n");
6752
6753           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6754                         bitsize, bitsize_zero_node);
6755           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6756           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6757           gimple_assign_set_lhs (epilog_stmt, new_temp);
6758           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6759           scalar_results.safe_push (new_temp);
6760         }
6761       else
6762         {
6763           /* Case 3: Create:
6764              s = extract_field <v_out2, 0>
6765              for (offset = element_size;
6766                   offset < vector_size;
6767                   offset += element_size;)
6768                {
6769                  Create:  s' = extract_field <v_out2, offset>
6770                  Create:  s = op <s, s'>  // For non SLP cases
6771                }  */
6772
6773           if (dump_enabled_p ())
6774             dump_printf_loc (MSG_NOTE, vect_location,
6775                              "Reduce using scalar code.\n");
6776
6777           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6778           int element_bitsize = tree_to_uhwi (bitsize);
6779           tree compute_type = TREE_TYPE (vectype);
6780           gimple_seq stmts = NULL;
6781           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6782             {
6783               int bit_offset;
6784               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6785                                        vec_temp, bitsize, bitsize_zero_node);
6786
6787               /* In SLP we don't need to apply reduction operation, so we just
6788                  collect s' values in SCALAR_RESULTS.  */
6789               if (slp_reduc)
6790                 scalar_results.safe_push (new_temp);
6791
6792               for (bit_offset = element_bitsize;
6793                    bit_offset < vec_size_in_bits;
6794                    bit_offset += element_bitsize)
6795                 {
6796                   tree bitpos = bitsize_int (bit_offset);
6797                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6798                                            compute_type, vec_temp,
6799                                            bitsize, bitpos);
6800                   if (slp_reduc)
6801                     {
6802                       /* In SLP we don't need to apply reduction operation, so
6803                          we just collect s' values in SCALAR_RESULTS.  */
6804                       new_temp = new_name;
6805                       scalar_results.safe_push (new_name);
6806                     }
6807                   else
6808                     new_temp = gimple_build (&stmts, code, compute_type,
6809                                              new_name, new_temp);
6810                 }
6811             }
6812
6813           /* The only case where we need to reduce scalar results in SLP, is
6814              unrolling.  If the size of SCALAR_RESULTS is greater than
6815              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6816              REDUC_GROUP_SIZE.  */
6817           if (slp_reduc)
6818             {
6819               tree res, first_res, new_res;
6820
6821               /* Reduce multiple scalar results in case of SLP unrolling.  */
6822               for (j = group_size; scalar_results.iterate (j, &res);
6823                    j++)
6824                 {
6825                   first_res = scalar_results[j % group_size];
6826                   new_res = gimple_build (&stmts, code, compute_type,
6827                                           first_res, res);
6828                   scalar_results[j % group_size] = new_res;
6829                 }
6830               scalar_results.truncate (group_size);
6831               for (k = 0; k < group_size; k++)
6832                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6833                                                     scalar_results[k]);
6834             }
6835           else
6836             {
6837               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6838               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6839               scalar_results.safe_push (new_temp);
6840             }
6841
6842           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6843         }
6844
6845       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6846           && induc_val)
6847         {
6848           /* Earlier we set the initial value to be a vector if induc_val
6849              values.  Check the result and if it is induc_val then replace
6850              with the original initial value, unless induc_val is
6851              the same as initial_def already.  */
6852           tree zcompare = make_ssa_name (boolean_type_node);
6853           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6854                                              induc_val);
6855           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6856           tree initial_def = reduc_info->reduc_initial_values[0];
6857           tree tmp = make_ssa_name (new_scalar_dest);
6858           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6859                                              initial_def, new_temp);
6860           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6861           scalar_results[0] = tmp;
6862         }
6863     }
6864
6865   /* 2.5 Adjust the final result by the initial value of the reduction
6866          variable. (When such adjustment is not needed, then
6867          'adjustment_def' is zero).  For example, if code is PLUS we create:
6868          new_temp = loop_exit_def + adjustment_def  */
6869
6870   if (adjustment_def)
6871     {
6872       gcc_assert (!slp_reduc);
6873       gimple_seq stmts = NULL;
6874       if (double_reduc)
6875         {
6876           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6877           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6878           new_temp = gimple_build (&stmts, code, vectype,
6879                                    reduc_inputs[0], adjustment_def);
6880         }
6881       else
6882         {
6883           new_temp = scalar_results[0];
6884           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6885           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6886                                            adjustment_def);
6887           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6888           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6889                                    new_temp, adjustment_def);
6890           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6891         }
6892
6893       epilog_stmt = gimple_seq_last_stmt (stmts);
6894       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6895       scalar_results[0] = new_temp;
6896     }
6897
6898   /* Record this operation if it could be reused by the epilogue loop.  */
6899   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6900       && reduc_inputs.length () == 1)
6901     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6902                                            { orig_reduc_input, reduc_info });
6903
6904   if (double_reduc)
6905     loop = outer_loop;
6906
6907   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6908           phis with new adjusted scalar results, i.e., replace use <s_out0>
6909           with use <s_out4>.
6910
6911      Transform:
6912         loop_exit:
6913           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6914           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6915           v_out2 = reduce <v_out1>
6916           s_out3 = extract_field <v_out2, 0>
6917           s_out4 = adjust_result <s_out3>
6918           use <s_out0>
6919           use <s_out0>
6920
6921      into:
6922
6923         loop_exit:
6924           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6925           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6926           v_out2 = reduce <v_out1>
6927           s_out3 = extract_field <v_out2, 0>
6928           s_out4 = adjust_result <s_out3>
6929           use <s_out4>
6930           use <s_out4> */
6931
6932   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6933   for (k = 0; k < live_out_stmts.size (); k++)
6934     {
6935       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6936       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6937
6938       phis.create (3);
6939       /* Find the loop-closed-use at the loop exit of the original scalar
6940          result.  (The reduction result is expected to have two immediate uses,
6941          one at the latch block, and one at the loop exit).  For double
6942          reductions we are looking for exit phis of the outer loop.  */
6943       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6944         {
6945           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6946             {
6947               if (!is_gimple_debug (USE_STMT (use_p)))
6948                 phis.safe_push (USE_STMT (use_p));
6949             }
6950           else
6951             {
6952               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6953                 {
6954                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6955
6956                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6957                     {
6958                       if (!flow_bb_inside_loop_p (loop,
6959                                              gimple_bb (USE_STMT (phi_use_p)))
6960                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6961                         phis.safe_push (USE_STMT (phi_use_p));
6962                     }
6963                 }
6964             }
6965         }
6966
6967       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6968         {
6969           /* Replace the uses:  */
6970           orig_name = PHI_RESULT (exit_phi);
6971
6972           /* Look for a single use at the target of the skip edge.  */
6973           if (unify_with_main_loop_p)
6974             {
6975               use_operand_p use_p;
6976               gimple *user;
6977               if (!single_imm_use (orig_name, &use_p, &user))
6978                 gcc_unreachable ();
6979               orig_name = gimple_get_lhs (user);
6980             }
6981
6982           scalar_result = scalar_results[k];
6983           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6984             {
6985               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6986                 SET_USE (use_p, scalar_result);
6987               update_stmt (use_stmt);
6988             }
6989         }
6990
6991       phis.release ();
6992     }
6993 }
6994
6995 /* Return a vector of type VECTYPE that is equal to the vector select
6996    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6997    before GSI.  */
6998
6999 static tree
7000 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
7001                      tree vec, tree identity)
7002 {
7003   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7004   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7005                                           mask, vec, identity);
7006   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7007   return cond;
7008 }
7009
7010 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7011    order, starting with LHS.  Insert the extraction statements before GSI and
7012    associate the new scalar SSA names with variable SCALAR_DEST.
7013    If MASK is nonzero mask the input and then operate on it unconditionally.
7014    Return the SSA name for the result.  */
7015
7016 static tree
7017 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7018                        tree_code code, tree lhs, tree vector_rhs,
7019                        tree mask)
7020 {
7021   tree vectype = TREE_TYPE (vector_rhs);
7022   tree scalar_type = TREE_TYPE (vectype);
7023   tree bitsize = TYPE_SIZE (scalar_type);
7024   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7025   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7026
7027   /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7028      to perform an unconditional element-wise reduction of it.  */
7029   if (mask)
7030     {
7031       tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7032                                                    "masked_vector_rhs");
7033       tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7034                                                   false);
7035       tree vector_identity = build_vector_from_val (vectype, neutral_op);
7036       gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7037                                              mask, vector_rhs, vector_identity);
7038       gsi_insert_before (gsi, select, GSI_SAME_STMT);
7039       vector_rhs = masked_vector_rhs;
7040     }
7041
7042   for (unsigned HOST_WIDE_INT bit_offset = 0;
7043        bit_offset < vec_size_in_bits;
7044        bit_offset += element_bitsize)
7045     {
7046       tree bitpos = bitsize_int (bit_offset);
7047       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7048                          bitsize, bitpos);
7049
7050       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7051       rhs = make_ssa_name (scalar_dest, stmt);
7052       gimple_assign_set_lhs (stmt, rhs);
7053       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7054
7055       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7056       tree new_name = make_ssa_name (scalar_dest, stmt);
7057       gimple_assign_set_lhs (stmt, new_name);
7058       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7059       lhs = new_name;
7060     }
7061   return lhs;
7062 }
7063
7064 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
7065    type of the vector input.  */
7066
7067 static internal_fn
7068 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7069 {
7070   internal_fn mask_reduc_fn;
7071   internal_fn mask_len_reduc_fn;
7072
7073   switch (reduc_fn)
7074     {
7075     case IFN_FOLD_LEFT_PLUS:
7076       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7077       mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7078       break;
7079
7080     default:
7081       return IFN_LAST;
7082     }
7083
7084   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7085                                       OPTIMIZE_FOR_SPEED))
7086     return mask_reduc_fn;
7087   if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7088                                       OPTIMIZE_FOR_SPEED))
7089     return mask_len_reduc_fn;
7090   return IFN_LAST;
7091 }
7092
7093 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
7094    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
7095    statement.  CODE is the operation performed by STMT_INFO and OPS are
7096    its scalar operands.  REDUC_INDEX is the index of the operand in
7097    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
7098    implements in-order reduction, or IFN_LAST if we should open-code it.
7099    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
7100    that should be used to control the operation in a fully-masked loop.  */
7101
7102 static bool
7103 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7104                                stmt_vec_info stmt_info,
7105                                gimple_stmt_iterator *gsi,
7106                                gimple **vec_stmt, slp_tree slp_node,
7107                                gimple *reduc_def_stmt,
7108                                code_helper code, internal_fn reduc_fn,
7109                                tree *ops, int num_ops, tree vectype_in,
7110                                int reduc_index, vec_loop_masks *masks,
7111                                vec_loop_lens *lens)
7112 {
7113   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7114   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7115   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7116
7117   int ncopies;
7118   if (slp_node)
7119     ncopies = 1;
7120   else
7121     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7122
7123   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7124   gcc_assert (ncopies == 1);
7125
7126   bool is_cond_op = false;
7127   if (!code.is_tree_code ())
7128     {
7129       code = conditional_internal_fn_code (internal_fn (code));
7130       gcc_assert (code != ERROR_MARK);
7131       is_cond_op = true;
7132     }
7133
7134   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7135
7136   if (slp_node)
7137     {
7138       if (is_cond_op)
7139         {
7140           if (dump_enabled_p ())
7141             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7142                              "fold-left reduction on SLP not supported.\n");
7143           return false;
7144         }
7145
7146       gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7147                             TYPE_VECTOR_SUBPARTS (vectype_in)));
7148     }
7149
7150   /* The operands either come from a binary operation or an IFN_COND operation.
7151      The former is a gimple assign with binary rhs and the latter is a
7152      gimple call with four arguments.  */
7153   gcc_assert (num_ops == 2 || num_ops == 4);
7154   tree op0, opmask;
7155   if (!is_cond_op)
7156     op0 = ops[1 - reduc_index];
7157   else
7158     {
7159       op0 = ops[2 + (1 - reduc_index)];
7160       opmask = ops[0];
7161       gcc_assert (!slp_node);
7162     }
7163
7164   int group_size = 1;
7165   stmt_vec_info scalar_dest_def_info;
7166   auto_vec<tree> vec_oprnds0, vec_opmask;
7167   if (slp_node)
7168     {
7169       auto_vec<vec<tree> > vec_defs (2);
7170       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7171       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7172       vec_defs[0].release ();
7173       vec_defs[1].release ();
7174       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7175       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7176     }
7177   else
7178     {
7179       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7180                                      op0, &vec_oprnds0);
7181       scalar_dest_def_info = stmt_info;
7182
7183       /* For an IFN_COND_OP we also need the vector mask operand.  */
7184       if (is_cond_op)
7185           vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7186                                          opmask, &vec_opmask);
7187     }
7188
7189   gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7190   tree scalar_dest = gimple_get_lhs (sdef);
7191   tree scalar_type = TREE_TYPE (scalar_dest);
7192   tree reduc_var = gimple_phi_result (reduc_def_stmt);
7193
7194   int vec_num = vec_oprnds0.length ();
7195   gcc_assert (vec_num == 1 || slp_node);
7196   tree vec_elem_type = TREE_TYPE (vectype_out);
7197   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7198
7199   tree vector_identity = NULL_TREE;
7200   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7201     {
7202       vector_identity = build_zero_cst (vectype_out);
7203       if (!HONOR_SIGNED_ZEROS (vectype_out))
7204         ;
7205       else
7206         {
7207           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7208           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7209                                         vector_identity);
7210         }
7211     }
7212
7213   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7214   int i;
7215   tree def0;
7216   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7217     {
7218       gimple *new_stmt;
7219       tree mask = NULL_TREE;
7220       tree len = NULL_TREE;
7221       tree bias = NULL_TREE;
7222       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7223         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7224       else if (is_cond_op)
7225         mask = vec_opmask[0];
7226       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7227         {
7228           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7229                                    i, 1);
7230           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7231           bias = build_int_cst (intQI_type_node, biasval);
7232           if (!is_cond_op)
7233             mask = build_minus_one_cst (truth_type_for (vectype_in));
7234         }
7235
7236       /* Handle MINUS by adding the negative.  */
7237       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7238         {
7239           tree negated = make_ssa_name (vectype_out);
7240           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7241           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7242           def0 = negated;
7243         }
7244
7245       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7246           && mask && mask_reduc_fn == IFN_LAST)
7247         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7248                                     vector_identity);
7249
7250       /* On the first iteration the input is simply the scalar phi
7251          result, and for subsequent iterations it is the output of
7252          the preceding operation.  */
7253       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7254         {
7255           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7256             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7257                                                    def0, mask, len, bias);
7258           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7259             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7260                                                    def0, mask);
7261           else
7262             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7263                                                    def0);
7264           /* For chained SLP reductions the output of the previous reduction
7265              operation serves as the input of the next. For the final statement
7266              the output cannot be a temporary - we reuse the original
7267              scalar destination of the last statement.  */
7268           if (i != vec_num - 1)
7269             {
7270               gimple_set_lhs (new_stmt, scalar_dest_var);
7271               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7272               gimple_set_lhs (new_stmt, reduc_var);
7273             }
7274         }
7275       else
7276         {
7277           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7278                                              tree_code (code), reduc_var, def0,
7279                                              mask);
7280           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7281           /* Remove the statement, so that we can use the same code paths
7282              as for statements that we've just created.  */
7283           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7284           gsi_remove (&tmp_gsi, true);
7285         }
7286
7287       if (i == vec_num - 1)
7288         {
7289           gimple_set_lhs (new_stmt, scalar_dest);
7290           vect_finish_replace_stmt (loop_vinfo,
7291                                     scalar_dest_def_info,
7292                                     new_stmt);
7293         }
7294       else
7295         vect_finish_stmt_generation (loop_vinfo,
7296                                      scalar_dest_def_info,
7297                                      new_stmt, gsi);
7298
7299       if (slp_node)
7300         slp_node->push_vec_def (new_stmt);
7301       else
7302         {
7303           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7304           *vec_stmt = new_stmt;
7305         }
7306     }
7307
7308   return true;
7309 }
7310
7311 /* Function is_nonwrapping_integer_induction.
7312
7313    Check if STMT_VINO (which is part of loop LOOP) both increments and
7314    does not cause overflow.  */
7315
7316 static bool
7317 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7318 {
7319   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7320   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7321   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7322   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7323   widest_int ni, max_loop_value, lhs_max;
7324   wi::overflow_type overflow = wi::OVF_NONE;
7325
7326   /* Make sure the loop is integer based.  */
7327   if (TREE_CODE (base) != INTEGER_CST
7328       || TREE_CODE (step) != INTEGER_CST)
7329     return false;
7330
7331   /* Check that the max size of the loop will not wrap.  */
7332
7333   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7334     return true;
7335
7336   if (! max_stmt_executions (loop, &ni))
7337     return false;
7338
7339   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7340                             &overflow);
7341   if (overflow)
7342     return false;
7343
7344   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7345                             TYPE_SIGN (lhs_type), &overflow);
7346   if (overflow)
7347     return false;
7348
7349   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7350           <= TYPE_PRECISION (lhs_type));
7351 }
7352
7353 /* Check if masking can be supported by inserting a conditional expression.
7354    CODE is the code for the operation.  COND_FN is the conditional internal
7355    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
7356 static bool
7357 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7358                          tree vectype_in)
7359 {
7360   if (cond_fn != IFN_LAST
7361       && direct_internal_fn_supported_p (cond_fn, vectype_in,
7362                                          OPTIMIZE_FOR_SPEED))
7363     return false;
7364
7365   if (code.is_tree_code ())
7366     switch (tree_code (code))
7367       {
7368       case DOT_PROD_EXPR:
7369       case SAD_EXPR:
7370         return true;
7371
7372       default:
7373         break;
7374       }
7375   return false;
7376 }
7377
7378 /* Insert a conditional expression to enable masked vectorization.  CODE is the
7379    code for the operation.  VOP is the array of operands.  MASK is the loop
7380    mask.  GSI is a statement iterator used to place the new conditional
7381    expression.  */
7382 static void
7383 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7384                       gimple_stmt_iterator *gsi)
7385 {
7386   switch (tree_code (code))
7387     {
7388     case DOT_PROD_EXPR:
7389       {
7390         tree vectype = TREE_TYPE (vop[1]);
7391         tree zero = build_zero_cst (vectype);
7392         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7393         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7394                                                mask, vop[1], zero);
7395         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7396         vop[1] = masked_op1;
7397         break;
7398       }
7399
7400     case SAD_EXPR:
7401       {
7402         tree vectype = TREE_TYPE (vop[1]);
7403         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7404         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7405                                                mask, vop[1], vop[0]);
7406         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7407         vop[1] = masked_op1;
7408         break;
7409       }
7410
7411     default:
7412       gcc_unreachable ();
7413     }
7414 }
7415
7416 /* Function vectorizable_reduction.
7417
7418    Check if STMT_INFO performs a reduction operation that can be vectorized.
7419    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7420    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7421    Return true if STMT_INFO is vectorizable in this way.
7422
7423    This function also handles reduction idioms (patterns) that have been
7424    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7425    may be of this form:
7426      X = pattern_expr (arg0, arg1, ..., X)
7427    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7428    sequence that had been detected and replaced by the pattern-stmt
7429    (STMT_INFO).
7430
7431    This function also handles reduction of condition expressions, for example:
7432      for (int i = 0; i < N; i++)
7433        if (a[i] < value)
7434          last = a[i];
7435    This is handled by vectorising the loop and creating an additional vector
7436    containing the loop indexes for which "a[i] < value" was true.  In the
7437    function epilogue this is reduced to a single max value and then used to
7438    index into the vector of results.
7439
7440    In some cases of reduction patterns, the type of the reduction variable X is
7441    different than the type of the other arguments of STMT_INFO.
7442    In such cases, the vectype that is used when transforming STMT_INFO into
7443    a vector stmt is different than the vectype that is used to determine the
7444    vectorization factor, because it consists of a different number of elements
7445    than the actual number of elements that are being operated upon in parallel.
7446
7447    For example, consider an accumulation of shorts into an int accumulator.
7448    On some targets it's possible to vectorize this pattern operating on 8
7449    shorts at a time (hence, the vectype for purposes of determining the
7450    vectorization factor should be V8HI); on the other hand, the vectype that
7451    is used to create the vector form is actually V4SI (the type of the result).
7452
7453    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7454    indicates what is the actual level of parallelism (V8HI in the example), so
7455    that the right vectorization factor would be derived.  This vectype
7456    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7457    be used to create the vectorized stmt.  The right vectype for the vectorized
7458    stmt is obtained from the type of the result X:
7459       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7460
7461    This means that, contrary to "regular" reductions (or "regular" stmts in
7462    general), the following equation:
7463       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7464    does *NOT* necessarily hold for reduction patterns.  */
7465
7466 bool
7467 vectorizable_reduction (loop_vec_info loop_vinfo,
7468                         stmt_vec_info stmt_info, slp_tree slp_node,
7469                         slp_instance slp_node_instance,
7470                         stmt_vector_for_cost *cost_vec)
7471 {
7472   tree vectype_in = NULL_TREE;
7473   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7474   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7475   stmt_vec_info cond_stmt_vinfo = NULL;
7476   int i;
7477   int ncopies;
7478   bool single_defuse_cycle = false;
7479   bool nested_cycle = false;
7480   bool double_reduc = false;
7481   int vec_num;
7482   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7483   tree cond_reduc_val = NULL_TREE;
7484
7485   /* Make sure it was already recognized as a reduction computation.  */
7486   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7487       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7488       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7489     return false;
7490
7491   /* The stmt we store reduction analysis meta on.  */
7492   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7493   reduc_info->is_reduc_info = true;
7494
7495   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7496     {
7497       if (is_a <gphi *> (stmt_info->stmt))
7498         {
7499           if (slp_node)
7500             {
7501               /* We eventually need to set a vector type on invariant
7502                  arguments.  */
7503               unsigned j;
7504               slp_tree child;
7505               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7506                 if (!vect_maybe_update_slp_op_vectype
7507                        (child, SLP_TREE_VECTYPE (slp_node)))
7508                   {
7509                     if (dump_enabled_p ())
7510                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7511                                        "incompatible vector types for "
7512                                        "invariants\n");
7513                     return false;
7514                   }
7515             }
7516           /* Analysis for double-reduction is done on the outer
7517              loop PHI, nested cycles have no further restrictions.  */
7518           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7519         }
7520       else
7521         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7522       return true;
7523     }
7524
7525   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7526   stmt_vec_info phi_info = stmt_info;
7527   if (!is_a <gphi *> (stmt_info->stmt))
7528     {
7529       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7530       return true;
7531     }
7532   if (slp_node)
7533     {
7534       slp_node_instance->reduc_phis = slp_node;
7535       /* ???  We're leaving slp_node to point to the PHIs, we only
7536          need it to get at the number of vector stmts which wasn't
7537          yet initialized for the instance root.  */
7538     }
7539   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7540     {
7541       use_operand_p use_p;
7542       gimple *use_stmt;
7543       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7544                                  &use_p, &use_stmt);
7545       gcc_assert (res);
7546       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7547     }
7548
7549   /* PHIs should not participate in patterns.  */
7550   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7551   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7552
7553   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7554      and compute the reduction chain length.  Discover the real
7555      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7556   tree reduc_def
7557     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7558                              loop_latch_edge
7559                                (gimple_bb (reduc_def_phi)->loop_father));
7560   unsigned reduc_chain_length = 0;
7561   bool only_slp_reduc_chain = true;
7562   stmt_info = NULL;
7563   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7564   while (reduc_def != PHI_RESULT (reduc_def_phi))
7565     {
7566       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7567       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7568       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7569         {
7570           if (dump_enabled_p ())
7571             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7572                              "reduction chain broken by patterns.\n");
7573           return false;
7574         }
7575       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7576         only_slp_reduc_chain = false;
7577       /* For epilogue generation live members of the chain need
7578          to point back to the PHI via their original stmt for
7579          info_for_reduction to work.  For SLP we need to look at
7580          all lanes here - even though we only will vectorize from
7581          the SLP node with live lane zero the other live lanes also
7582          need to be identified as part of a reduction to be able
7583          to skip code generation for them.  */
7584       if (slp_for_stmt_info)
7585         {
7586           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7587             if (STMT_VINFO_LIVE_P (s))
7588               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7589         }
7590       else if (STMT_VINFO_LIVE_P (vdef))
7591         STMT_VINFO_REDUC_DEF (def) = phi_info;
7592       gimple_match_op op;
7593       if (!gimple_extract_op (vdef->stmt, &op))
7594         {
7595           if (dump_enabled_p ())
7596             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7597                              "reduction chain includes unsupported"
7598                              " statement type.\n");
7599           return false;
7600         }
7601       if (CONVERT_EXPR_CODE_P (op.code))
7602         {
7603           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7604             {
7605               if (dump_enabled_p ())
7606                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7607                                  "conversion in the reduction chain.\n");
7608               return false;
7609             }
7610         }
7611       else if (!stmt_info)
7612         /* First non-conversion stmt.  */
7613         stmt_info = vdef;
7614       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7615       reduc_chain_length++;
7616       if (!stmt_info && slp_node)
7617         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7618     }
7619   /* PHIs should not participate in patterns.  */
7620   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7621
7622   if (nested_in_vect_loop_p (loop, stmt_info))
7623     {
7624       loop = loop->inner;
7625       nested_cycle = true;
7626     }
7627
7628   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7629      element.  */
7630   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7631     {
7632       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7633       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7634     }
7635   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7636     gcc_assert (slp_node
7637                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7638
7639   /* 1. Is vectorizable reduction?  */
7640   /* Not supportable if the reduction variable is used in the loop, unless
7641      it's a reduction chain.  */
7642   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7643       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7644     return false;
7645
7646   /* Reductions that are not used even in an enclosing outer-loop,
7647      are expected to be "live" (used out of the loop).  */
7648   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7649       && !STMT_VINFO_LIVE_P (stmt_info))
7650     return false;
7651
7652   /* 2. Has this been recognized as a reduction pattern?
7653
7654      Check if STMT represents a pattern that has been recognized
7655      in earlier analysis stages.  For stmts that represent a pattern,
7656      the STMT_VINFO_RELATED_STMT field records the last stmt in
7657      the original sequence that constitutes the pattern.  */
7658
7659   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7660   if (orig_stmt_info)
7661     {
7662       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7663       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7664     }
7665
7666   /* 3. Check the operands of the operation.  The first operands are defined
7667         inside the loop body. The last operand is the reduction variable,
7668         which is defined by the loop-header-phi.  */
7669
7670   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7671   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7672   gimple_match_op op;
7673   if (!gimple_extract_op (stmt_info->stmt, &op))
7674     gcc_unreachable ();
7675   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7676                             || op.code == WIDEN_SUM_EXPR
7677                             || op.code == SAD_EXPR);
7678
7679   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7680       && !SCALAR_FLOAT_TYPE_P (op.type))
7681     return false;
7682
7683   /* Do not try to vectorize bit-precision reductions.  */
7684   if (!type_has_mode_precision_p (op.type))
7685     return false;
7686
7687   /* For lane-reducing ops we're reducing the number of reduction PHIs
7688      which means the only use of that may be in the lane-reducing operation.  */
7689   if (lane_reduc_code_p
7690       && reduc_chain_length != 1
7691       && !only_slp_reduc_chain)
7692     {
7693       if (dump_enabled_p ())
7694         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7695                          "lane-reducing reduction with extra stmts.\n");
7696       return false;
7697     }
7698
7699   /* All uses but the last are expected to be defined in the loop.
7700      The last use is the reduction variable.  In case of nested cycle this
7701      assumption is not true: we use reduc_index to record the index of the
7702      reduction variable.  */
7703   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7704   tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7705   /* We need to skip an extra operand for COND_EXPRs with embedded
7706      comparison.  */
7707   unsigned opno_adjust = 0;
7708   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7709     opno_adjust = 1;
7710   for (i = 0; i < (int) op.num_ops; i++)
7711     {
7712       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7713       if (i == 0 && op.code == COND_EXPR)
7714         continue;
7715
7716       stmt_vec_info def_stmt_info;
7717       enum vect_def_type dt;
7718       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7719                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7720                                &vectype_op[i], &def_stmt_info))
7721         {
7722           if (dump_enabled_p ())
7723             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7724                              "use not simple.\n");
7725           return false;
7726         }
7727       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7728         continue;
7729
7730       /* For an IFN_COND_OP we might hit the reduction definition operand
7731          twice (once as definition, once as else).  */
7732       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7733         continue;
7734
7735       /* There should be only one cycle def in the stmt, the one
7736          leading to reduc_def.  */
7737       if (VECTORIZABLE_CYCLE_DEF (dt))
7738         return false;
7739
7740       if (!vectype_op[i])
7741         vectype_op[i]
7742           = get_vectype_for_scalar_type (loop_vinfo,
7743                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7744
7745       /* To properly compute ncopies we are interested in the widest
7746          non-reduction input type in case we're looking at a widening
7747          accumulation that we later handle in vect_transform_reduction.  */
7748       if (lane_reduc_code_p
7749           && vectype_op[i]
7750           && (!vectype_in
7751               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7752                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7753         vectype_in = vectype_op[i];
7754
7755       if (op.code == COND_EXPR)
7756         {
7757           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
7758           if (dt == vect_constant_def)
7759             {
7760               cond_reduc_dt = dt;
7761               cond_reduc_val = op.ops[i];
7762             }
7763           if (dt == vect_induction_def
7764               && def_stmt_info
7765               && is_nonwrapping_integer_induction (def_stmt_info, loop))
7766             {
7767               cond_reduc_dt = dt;
7768               cond_stmt_vinfo = def_stmt_info;
7769             }
7770         }
7771     }
7772   if (!vectype_in)
7773     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7774   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7775
7776   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7777   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7778   /* If we have a condition reduction, see if we can simplify it further.  */
7779   if (v_reduc_type == COND_REDUCTION)
7780     {
7781       if (slp_node)
7782         return false;
7783
7784       /* When the condition uses the reduction value in the condition, fail.  */
7785       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7786         {
7787           if (dump_enabled_p ())
7788             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7789                              "condition depends on previous iteration\n");
7790           return false;
7791         }
7792
7793       if (reduc_chain_length == 1
7794           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7795                                               OPTIMIZE_FOR_SPEED)
7796               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7797                                                  vectype_in,
7798                                                  OPTIMIZE_FOR_SPEED)))
7799         {
7800           if (dump_enabled_p ())
7801             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7802                              "optimizing condition reduction with"
7803                              " FOLD_EXTRACT_LAST.\n");
7804           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7805         }
7806       else if (cond_reduc_dt == vect_induction_def)
7807         {
7808           tree base
7809             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7810           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7811
7812           gcc_assert (TREE_CODE (base) == INTEGER_CST
7813                       && TREE_CODE (step) == INTEGER_CST);
7814           cond_reduc_val = NULL_TREE;
7815           enum tree_code cond_reduc_op_code = ERROR_MARK;
7816           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7817           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7818             ;
7819           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7820              above base; punt if base is the minimum value of the type for
7821              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7822           else if (tree_int_cst_sgn (step) == -1)
7823             {
7824               cond_reduc_op_code = MIN_EXPR;
7825               if (tree_int_cst_sgn (base) == -1)
7826                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7827               else if (tree_int_cst_lt (base,
7828                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7829                 cond_reduc_val
7830                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7831             }
7832           else
7833             {
7834               cond_reduc_op_code = MAX_EXPR;
7835               if (tree_int_cst_sgn (base) == 1)
7836                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7837               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7838                                         base))
7839                 cond_reduc_val
7840                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7841             }
7842           if (cond_reduc_val)
7843             {
7844               if (dump_enabled_p ())
7845                 dump_printf_loc (MSG_NOTE, vect_location,
7846                                  "condition expression based on "
7847                                  "integer induction.\n");
7848               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7849               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7850                 = cond_reduc_val;
7851               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7852             }
7853         }
7854       else if (cond_reduc_dt == vect_constant_def)
7855         {
7856           enum vect_def_type cond_initial_dt;
7857           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7858           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7859           if (cond_initial_dt == vect_constant_def
7860               && types_compatible_p (TREE_TYPE (cond_initial_val),
7861                                      TREE_TYPE (cond_reduc_val)))
7862             {
7863               tree e = fold_binary (LE_EXPR, boolean_type_node,
7864                                     cond_initial_val, cond_reduc_val);
7865               if (e && (integer_onep (e) || integer_zerop (e)))
7866                 {
7867                   if (dump_enabled_p ())
7868                     dump_printf_loc (MSG_NOTE, vect_location,
7869                                      "condition expression based on "
7870                                      "compile time constant.\n");
7871                   /* Record reduction code at analysis stage.  */
7872                   STMT_VINFO_REDUC_CODE (reduc_info)
7873                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7874                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7875                 }
7876             }
7877         }
7878     }
7879
7880   if (STMT_VINFO_LIVE_P (phi_info))
7881     return false;
7882
7883   if (slp_node)
7884     ncopies = 1;
7885   else
7886     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7887
7888   gcc_assert (ncopies >= 1);
7889
7890   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7891
7892   if (nested_cycle)
7893     {
7894       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7895                   == vect_double_reduction_def);
7896       double_reduc = true;
7897     }
7898
7899   /* 4.2. Check support for the epilog operation.
7900
7901           If STMT represents a reduction pattern, then the type of the
7902           reduction variable may be different than the type of the rest
7903           of the arguments.  For example, consider the case of accumulation
7904           of shorts into an int accumulator; The original code:
7905                         S1: int_a = (int) short_a;
7906           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7907
7908           was replaced with:
7909                         STMT: int_acc = widen_sum <short_a, int_acc>
7910
7911           This means that:
7912           1. The tree-code that is used to create the vector operation in the
7913              epilog code (that reduces the partial results) is not the
7914              tree-code of STMT, but is rather the tree-code of the original
7915              stmt from the pattern that STMT is replacing.  I.e, in the example
7916              above we want to use 'widen_sum' in the loop, but 'plus' in the
7917              epilog.
7918           2. The type (mode) we use to check available target support
7919              for the vector operation to be created in the *epilog*, is
7920              determined by the type of the reduction variable (in the example
7921              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7922              However the type (mode) we use to check available target support
7923              for the vector operation to be created *inside the loop*, is
7924              determined by the type of the other arguments to STMT (in the
7925              example we'd check this: optab_handler (widen_sum_optab,
7926              vect_short_mode)).
7927
7928           This is contrary to "regular" reductions, in which the types of all
7929           the arguments are the same as the type of the reduction variable.
7930           For "regular" reductions we can therefore use the same vector type
7931           (and also the same tree-code) when generating the epilog code and
7932           when generating the code inside the loop.  */
7933
7934   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7935
7936   /* If conversion might have created a conditional operation like
7937      IFN_COND_ADD already.  Use the internal code for the following checks.  */
7938   if (orig_code.is_internal_fn ())
7939     {
7940       tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7941       orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7942     }
7943
7944   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7945
7946   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7947   if (reduction_type == TREE_CODE_REDUCTION)
7948     {
7949       /* Check whether it's ok to change the order of the computation.
7950          Generally, when vectorizing a reduction we change the order of the
7951          computation.  This may change the behavior of the program in some
7952          cases, so we need to check that this is ok.  One exception is when
7953          vectorizing an outer-loop: the inner-loop is executed sequentially,
7954          and therefore vectorizing reductions in the inner-loop during
7955          outer-loop vectorization is safe.  Likewise when we are vectorizing
7956          a series of reductions using SLP and the VF is one the reductions
7957          are performed in scalar order.  */
7958       if (slp_node
7959           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7960           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7961         ;
7962       else if (needs_fold_left_reduction_p (op.type, orig_code))
7963         {
7964           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7965              is not directy used in stmt.  */
7966           if (!only_slp_reduc_chain
7967               && reduc_chain_length != 1)
7968             {
7969               if (dump_enabled_p ())
7970                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7971                                  "in-order reduction chain without SLP.\n");
7972               return false;
7973             }
7974           STMT_VINFO_REDUC_TYPE (reduc_info)
7975             = reduction_type = FOLD_LEFT_REDUCTION;
7976         }
7977       else if (!commutative_binary_op_p (orig_code, op.type)
7978                || !associative_binary_op_p (orig_code, op.type))
7979         {
7980           if (dump_enabled_p ())
7981             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7982                             "reduction: not commutative/associative\n");
7983           return false;
7984         }
7985     }
7986
7987   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7988       && ncopies > 1)
7989     {
7990       if (dump_enabled_p ())
7991         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7992                          "multiple types in double reduction or condition "
7993                          "reduction or fold-left reduction.\n");
7994       return false;
7995     }
7996
7997   internal_fn reduc_fn = IFN_LAST;
7998   if (reduction_type == TREE_CODE_REDUCTION
7999       || reduction_type == FOLD_LEFT_REDUCTION
8000       || reduction_type == INTEGER_INDUC_COND_REDUCTION
8001       || reduction_type == CONST_COND_REDUCTION)
8002     {
8003       if (reduction_type == FOLD_LEFT_REDUCTION
8004           ? fold_left_reduction_fn (orig_code, &reduc_fn)
8005           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8006         {
8007           if (reduc_fn != IFN_LAST
8008               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8009                                                   OPTIMIZE_FOR_SPEED))
8010             {
8011               if (dump_enabled_p ())
8012                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8013                                  "reduc op not supported by target.\n");
8014
8015               reduc_fn = IFN_LAST;
8016             }
8017         }
8018       else
8019         {
8020           if (!nested_cycle || double_reduc)
8021             {
8022               if (dump_enabled_p ())
8023                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8024                                  "no reduc code for scalar code.\n");
8025
8026               return false;
8027             }
8028         }
8029     }
8030   else if (reduction_type == COND_REDUCTION)
8031     {
8032       int scalar_precision
8033         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8034       cr_index_scalar_type = make_unsigned_type (scalar_precision);
8035       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8036                                                 vectype_out);
8037
8038       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8039                                           OPTIMIZE_FOR_SPEED))
8040         reduc_fn = IFN_REDUC_MAX;
8041     }
8042   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8043
8044   if (reduction_type != EXTRACT_LAST_REDUCTION
8045       && (!nested_cycle || double_reduc)
8046       && reduc_fn == IFN_LAST
8047       && !nunits_out.is_constant ())
8048     {
8049       if (dump_enabled_p ())
8050         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8051                          "missing target support for reduction on"
8052                          " variable-length vectors.\n");
8053       return false;
8054     }
8055
8056   /* For SLP reductions, see if there is a neutral value we can use.  */
8057   tree neutral_op = NULL_TREE;
8058   if (slp_node)
8059     {
8060       tree initial_value = NULL_TREE;
8061       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8062         initial_value = vect_phi_initial_value (reduc_def_phi);
8063       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8064                                              orig_code, initial_value);
8065     }
8066
8067   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8068     {
8069       /* We can't support in-order reductions of code such as this:
8070
8071            for (int i = 0; i < n1; ++i)
8072              for (int j = 0; j < n2; ++j)
8073                l += a[j];
8074
8075          since GCC effectively transforms the loop when vectorizing:
8076
8077            for (int i = 0; i < n1 / VF; ++i)
8078              for (int j = 0; j < n2; ++j)
8079                for (int k = 0; k < VF; ++k)
8080                  l += a[j];
8081
8082          which is a reassociation of the original operation.  */
8083       if (dump_enabled_p ())
8084         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8085                          "in-order double reduction not supported.\n");
8086
8087       return false;
8088     }
8089
8090   if (reduction_type == FOLD_LEFT_REDUCTION
8091       && slp_node
8092       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8093     {
8094       /* We cannot use in-order reductions in this case because there is
8095          an implicit reassociation of the operations involved.  */
8096       if (dump_enabled_p ())
8097         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8098                          "in-order unchained SLP reductions not supported.\n");
8099       return false;
8100     }
8101
8102   /* For double reductions, and for SLP reductions with a neutral value,
8103      we construct a variable-length initial vector by loading a vector
8104      full of the neutral value and then shift-and-inserting the start
8105      values into the low-numbered elements.  */
8106   if ((double_reduc || neutral_op)
8107       && !nunits_out.is_constant ()
8108       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8109                                           vectype_out, OPTIMIZE_FOR_SPEED))
8110     {
8111       if (dump_enabled_p ())
8112         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8113                          "reduction on variable-length vectors requires"
8114                          " target support for a vector-shift-and-insert"
8115                          " operation.\n");
8116       return false;
8117     }
8118
8119   /* Check extra constraints for variable-length unchained SLP reductions.  */
8120   if (slp_node
8121       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8122       && !nunits_out.is_constant ())
8123     {
8124       /* We checked above that we could build the initial vector when
8125          there's a neutral element value.  Check here for the case in
8126          which each SLP statement has its own initial value and in which
8127          that value needs to be repeated for every instance of the
8128          statement within the initial vector.  */
8129       unsigned int group_size = SLP_TREE_LANES (slp_node);
8130       if (!neutral_op
8131           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8132                                               TREE_TYPE (vectype_out)))
8133         {
8134           if (dump_enabled_p ())
8135             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8136                              "unsupported form of SLP reduction for"
8137                              " variable-length vectors: cannot build"
8138                              " initial vector.\n");
8139           return false;
8140         }
8141       /* The epilogue code relies on the number of elements being a multiple
8142          of the group size.  The duplicate-and-interleave approach to setting
8143          up the initial vector does too.  */
8144       if (!multiple_p (nunits_out, group_size))
8145         {
8146           if (dump_enabled_p ())
8147             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8148                              "unsupported form of SLP reduction for"
8149                              " variable-length vectors: the vector size"
8150                              " is not a multiple of the number of results.\n");
8151           return false;
8152         }
8153     }
8154
8155   if (reduction_type == COND_REDUCTION)
8156     {
8157       widest_int ni;
8158
8159       if (! max_loop_iterations (loop, &ni))
8160         {
8161           if (dump_enabled_p ())
8162             dump_printf_loc (MSG_NOTE, vect_location,
8163                              "loop count not known, cannot create cond "
8164                              "reduction.\n");
8165           return false;
8166         }
8167       /* Convert backedges to iterations.  */
8168       ni += 1;
8169
8170       /* The additional index will be the same type as the condition.  Check
8171          that the loop can fit into this less one (because we'll use up the
8172          zero slot for when there are no matches).  */
8173       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8174       if (wi::geu_p (ni, wi::to_widest (max_index)))
8175         {
8176           if (dump_enabled_p ())
8177             dump_printf_loc (MSG_NOTE, vect_location,
8178                              "loop size is greater than data size.\n");
8179           return false;
8180         }
8181     }
8182
8183   /* In case the vectorization factor (VF) is bigger than the number
8184      of elements that we can fit in a vectype (nunits), we have to generate
8185      more than one vector stmt - i.e - we need to "unroll" the
8186      vector stmt by a factor VF/nunits.  For more details see documentation
8187      in vectorizable_operation.  */
8188
8189   /* If the reduction is used in an outer loop we need to generate
8190      VF intermediate results, like so (e.g. for ncopies=2):
8191         r0 = phi (init, r0)
8192         r1 = phi (init, r1)
8193         r0 = x0 + r0;
8194         r1 = x1 + r1;
8195     (i.e. we generate VF results in 2 registers).
8196     In this case we have a separate def-use cycle for each copy, and therefore
8197     for each copy we get the vector def for the reduction variable from the
8198     respective phi node created for this copy.
8199
8200     Otherwise (the reduction is unused in the loop nest), we can combine
8201     together intermediate results, like so (e.g. for ncopies=2):
8202         r = phi (init, r)
8203         r = x0 + r;
8204         r = x1 + r;
8205    (i.e. we generate VF/2 results in a single register).
8206    In this case for each copy we get the vector def for the reduction variable
8207    from the vectorized reduction operation generated in the previous iteration.
8208
8209    This only works when we see both the reduction PHI and its only consumer
8210    in vectorizable_reduction and there are no intermediate stmts
8211    participating.  When unrolling we want each unrolled iteration to have its
8212    own reduction accumulator since one of the main goals of unrolling a
8213    reduction is to reduce the aggregate loop-carried latency.  */
8214   if (ncopies > 1
8215       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8216       && reduc_chain_length == 1
8217       && loop_vinfo->suggested_unroll_factor == 1)
8218     single_defuse_cycle = true;
8219
8220   if (single_defuse_cycle || lane_reduc_code_p)
8221     {
8222       gcc_assert (op.code != COND_EXPR);
8223
8224       /* 4. Supportable by target?  */
8225       bool ok = true;
8226
8227       /* 4.1. check support for the operation in the loop
8228
8229          This isn't necessary for the lane reduction codes, since they
8230          can only be produced by pattern matching, and it's up to the
8231          pattern matcher to test for support.  The main reason for
8232          specifically skipping this step is to avoid rechecking whether
8233          mixed-sign dot-products can be implemented using signed
8234          dot-products.  */
8235       machine_mode vec_mode = TYPE_MODE (vectype_in);
8236       if (!lane_reduc_code_p
8237           && !directly_supported_p (op.code, vectype_in, optab_vector))
8238         {
8239           if (dump_enabled_p ())
8240             dump_printf (MSG_NOTE, "op not supported by target.\n");
8241           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8242               || !vect_can_vectorize_without_simd_p (op.code))
8243             ok = false;
8244           else
8245             if (dump_enabled_p ())
8246               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8247         }
8248
8249       if (vect_emulated_vector_p (vectype_in)
8250           && !vect_can_vectorize_without_simd_p (op.code))
8251         {
8252           if (dump_enabled_p ())
8253             dump_printf (MSG_NOTE, "using word mode not possible.\n");
8254           return false;
8255         }
8256
8257       /* lane-reducing operations have to go through vect_transform_reduction.
8258          For the other cases try without the single cycle optimization.  */
8259       if (!ok)
8260         {
8261           if (lane_reduc_code_p)
8262             return false;
8263           else
8264             single_defuse_cycle = false;
8265         }
8266     }
8267   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8268
8269   /* If the reduction stmt is one of the patterns that have lane
8270      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
8271   if ((ncopies > 1 && ! single_defuse_cycle)
8272       && lane_reduc_code_p)
8273     {
8274       if (dump_enabled_p ())
8275         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8276                          "multi def-use cycle not possible for lane-reducing "
8277                          "reduction operation\n");
8278       return false;
8279     }
8280
8281   if (slp_node
8282       && !(!single_defuse_cycle
8283            && !lane_reduc_code_p
8284            && reduction_type != FOLD_LEFT_REDUCTION))
8285     for (i = 0; i < (int) op.num_ops; i++)
8286       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8287         {
8288           if (dump_enabled_p ())
8289             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8290                              "incompatible vector types for invariants\n");
8291           return false;
8292         }
8293
8294   if (slp_node)
8295     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8296   else
8297     vec_num = 1;
8298
8299   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8300                              reduction_type, ncopies, cost_vec);
8301   /* Cost the reduction op inside the loop if transformed via
8302      vect_transform_reduction.  Otherwise this is costed by the
8303      separate vectorizable_* routines.  */
8304   if (single_defuse_cycle || lane_reduc_code_p)
8305     {
8306       int factor = 1;
8307       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8308         /* Three dot-products and a subtraction.  */
8309         factor = 4;
8310       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8311                         stmt_info, 0, vect_body);
8312     }
8313
8314   if (dump_enabled_p ()
8315       && reduction_type == FOLD_LEFT_REDUCTION)
8316     dump_printf_loc (MSG_NOTE, vect_location,
8317                      "using an in-order (fold-left) reduction.\n");
8318   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8319   /* All but single defuse-cycle optimized, lane-reducing and fold-left
8320      reductions go through their own vectorizable_* routines.  */
8321   if (!single_defuse_cycle
8322       && !lane_reduc_code_p
8323       && reduction_type != FOLD_LEFT_REDUCTION)
8324     {
8325       stmt_vec_info tem
8326         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8327       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8328         {
8329           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8330           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8331         }
8332       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8333       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8334     }
8335   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8336     {
8337       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8338       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8339       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8340
8341       if (reduction_type != FOLD_LEFT_REDUCTION
8342           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8343           && (cond_fn == IFN_LAST
8344               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8345                                                   OPTIMIZE_FOR_SPEED)))
8346         {
8347           if (dump_enabled_p ())
8348             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8349                              "can't operate on partial vectors because"
8350                              " no conditional operation is available.\n");
8351           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8352         }
8353       else if (reduction_type == FOLD_LEFT_REDUCTION
8354                && reduc_fn == IFN_LAST
8355                && !expand_vec_cond_expr_p (vectype_in,
8356                                            truth_type_for (vectype_in),
8357                                            SSA_NAME))
8358         {
8359           if (dump_enabled_p ())
8360             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8361                              "can't operate on partial vectors because"
8362                              " no conditional operation is available.\n");
8363           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8364         }
8365       else if (reduction_type == FOLD_LEFT_REDUCTION
8366                && internal_fn_mask_index (reduc_fn) == -1
8367                && FLOAT_TYPE_P (vectype_in)
8368                && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8369         {
8370           if (dump_enabled_p ())
8371             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8372                              "can't operate on partial vectors because"
8373                              " signed zeros cannot be preserved.\n");
8374           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8375         }
8376       else
8377         {
8378           internal_fn mask_reduc_fn
8379             = get_masked_reduction_fn (reduc_fn, vectype_in);
8380
8381           if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8382             vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8383                                   vectype_in, 1);
8384           else
8385             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8386                                    vectype_in, NULL);
8387         }
8388     }
8389   return true;
8390 }
8391
8392 /* STMT_INFO is a dot-product reduction whose multiplication operands
8393    have different signs.  Emit a sequence to emulate the operation
8394    using a series of signed DOT_PROD_EXPRs and return the last
8395    statement generated.  VEC_DEST is the result of the vector operation
8396    and VOP lists its inputs.  */
8397
8398 static gassign *
8399 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8400                              gimple_stmt_iterator *gsi, tree vec_dest,
8401                              tree vop[3])
8402 {
8403   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8404   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8405   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8406   gimple *new_stmt;
8407
8408   /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
8409   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8410     std::swap (vop[0], vop[1]);
8411
8412   /* Convert all inputs to signed types.  */
8413   for (int i = 0; i < 3; ++i)
8414     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8415       {
8416         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8417         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8418         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8419         vop[i] = tmp;
8420       }
8421
8422   /* In the comments below we assume 8-bit inputs for simplicity,
8423      but the approach works for any full integer type.  */
8424
8425   /* Create a vector of -128.  */
8426   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8427   tree min_narrow = build_vector_from_val (narrow_vectype,
8428                                            min_narrow_elttype);
8429
8430   /* Create a vector of 64.  */
8431   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8432   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8433   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8434
8435   /* Emit: SUB_RES = VOP[0] - 128.  */
8436   tree sub_res = make_ssa_name (narrow_vectype);
8437   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8438   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8439
8440   /* Emit:
8441
8442        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8443        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8444        STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8445
8446      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8447      Doing the two 64 * y steps first allows more time to compute x.  */
8448   tree stage1 = make_ssa_name (wide_vectype);
8449   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8450                                   vop[1], half_narrow, vop[2]);
8451   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8452
8453   tree stage2 = make_ssa_name (wide_vectype);
8454   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8455                                   vop[1], half_narrow, stage1);
8456   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8457
8458   tree stage3 = make_ssa_name (wide_vectype);
8459   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8460                                   sub_res, vop[1], stage2);
8461   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8462
8463   /* Convert STAGE3 to the reduction type.  */
8464   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8465 }
8466
8467 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8468    value.  */
8469
8470 bool
8471 vect_transform_reduction (loop_vec_info loop_vinfo,
8472                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8473                           gimple **vec_stmt, slp_tree slp_node)
8474 {
8475   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8476   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8477   int i;
8478   int ncopies;
8479   int vec_num;
8480
8481   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8482   gcc_assert (reduc_info->is_reduc_info);
8483
8484   if (nested_in_vect_loop_p (loop, stmt_info))
8485     {
8486       loop = loop->inner;
8487       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8488     }
8489
8490   gimple_match_op op;
8491   if (!gimple_extract_op (stmt_info->stmt, &op))
8492     gcc_unreachable ();
8493
8494   /* All uses but the last are expected to be defined in the loop.
8495      The last use is the reduction variable.  In case of nested cycle this
8496      assumption is not true: we use reduc_index to record the index of the
8497      reduction variable.  */
8498   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8499   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8500   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8501   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8502
8503   if (slp_node)
8504     {
8505       ncopies = 1;
8506       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8507     }
8508   else
8509     {
8510       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8511       vec_num = 1;
8512     }
8513
8514   code_helper code = canonicalize_code (op.code, op.type);
8515   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8516
8517   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8518   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8519   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8520
8521   /* Transform.  */
8522   tree new_temp = NULL_TREE;
8523   auto_vec<tree> vec_oprnds0;
8524   auto_vec<tree> vec_oprnds1;
8525   auto_vec<tree> vec_oprnds2;
8526   tree def0;
8527
8528   if (dump_enabled_p ())
8529     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8530
8531   /* FORNOW: Multiple types are not supported for condition.  */
8532   if (code == COND_EXPR)
8533     gcc_assert (ncopies == 1);
8534
8535   /* A binary COND_OP reduction must have the same definition and else
8536      value. */
8537   bool cond_fn_p = code.is_internal_fn ()
8538     && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8539   if (cond_fn_p)
8540     {
8541       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8542                   || code == IFN_COND_MUL || code == IFN_COND_AND
8543                   || code == IFN_COND_IOR || code == IFN_COND_XOR);
8544       gcc_assert (op.num_ops == 4
8545                   && (op.ops[reduc_index]
8546                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
8547     }
8548
8549   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8550
8551   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8552   if (reduction_type == FOLD_LEFT_REDUCTION)
8553     {
8554       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8555       gcc_assert (code.is_tree_code () || cond_fn_p);
8556       return vectorize_fold_left_reduction
8557           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8558            code, reduc_fn, op.ops, op.num_ops, vectype_in,
8559            reduc_index, masks, lens);
8560     }
8561
8562   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8563   gcc_assert (single_defuse_cycle
8564               || code == DOT_PROD_EXPR
8565               || code == WIDEN_SUM_EXPR
8566               || code == SAD_EXPR);
8567
8568   /* Create the destination vector  */
8569   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8570   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8571
8572   /* Get NCOPIES vector definitions for all operands except the reduction
8573      definition.  */
8574   if (!cond_fn_p)
8575     {
8576       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8577                          single_defuse_cycle && reduc_index == 0
8578                          ? NULL_TREE : op.ops[0], &vec_oprnds0,
8579                          single_defuse_cycle && reduc_index == 1
8580                          ? NULL_TREE : op.ops[1], &vec_oprnds1,
8581                          op.num_ops == 3
8582                          && !(single_defuse_cycle && reduc_index == 2)
8583                          ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8584     }
8585   else
8586     {
8587       /* For a conditional operation pass the truth type as mask
8588          vectype.  */
8589       gcc_assert (single_defuse_cycle
8590                   && (reduc_index == 1 || reduc_index == 2));
8591       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8592                          op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8593                          reduc_index == 1 ? NULL_TREE : op.ops[1],
8594                          NULL_TREE, &vec_oprnds1,
8595                          reduc_index == 2 ? NULL_TREE : op.ops[2],
8596                          NULL_TREE, &vec_oprnds2);
8597     }
8598
8599   /* For single def-use cycles get one copy of the vectorized reduction
8600      definition.  */
8601   if (single_defuse_cycle)
8602     {
8603       gcc_assert (!slp_node);
8604       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8605                                      op.ops[reduc_index],
8606                                      reduc_index == 0 ? &vec_oprnds0
8607                                      : (reduc_index == 1 ? &vec_oprnds1
8608                                         : &vec_oprnds2));
8609     }
8610
8611   bool emulated_mixed_dot_prod
8612     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8613   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8614     {
8615       gimple *new_stmt;
8616       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8617       if (masked_loop_p && !mask_by_cond_expr)
8618         {
8619           /* No conditional ifns have been defined for dot-product yet.  */
8620           gcc_assert (code != DOT_PROD_EXPR);
8621
8622           /* Make sure that the reduction accumulator is vop[0].  */
8623           if (reduc_index == 1)
8624             {
8625               gcc_assert (commutative_binary_op_p (code, op.type));
8626               std::swap (vop[0], vop[1]);
8627             }
8628           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8629                                           vec_num * ncopies, vectype_in, i);
8630           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8631                                                     vop[0], vop[1], vop[0]);
8632           new_temp = make_ssa_name (vec_dest, call);
8633           gimple_call_set_lhs (call, new_temp);
8634           gimple_call_set_nothrow (call, true);
8635           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8636           new_stmt = call;
8637         }
8638       else
8639         {
8640           if (op.num_ops >= 3)
8641             vop[2] = vec_oprnds2[i];
8642
8643           if (masked_loop_p && mask_by_cond_expr)
8644             {
8645               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8646                                               vec_num * ncopies, vectype_in, i);
8647               build_vect_cond_expr (code, vop, mask, gsi);
8648             }
8649
8650           if (emulated_mixed_dot_prod)
8651             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8652                                                     vec_dest, vop);
8653
8654           else if (code.is_internal_fn () && !cond_fn_p)
8655             new_stmt = gimple_build_call_internal (internal_fn (code),
8656                                                    op.num_ops,
8657                                                    vop[0], vop[1], vop[2]);
8658           else if (code.is_internal_fn () && cond_fn_p)
8659             new_stmt = gimple_build_call_internal (internal_fn (code),
8660                                                    op.num_ops,
8661                                                    vop[0], vop[1], vop[2],
8662                                                    vop[1]);
8663           else
8664             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8665                                             vop[0], vop[1], vop[2]);
8666           new_temp = make_ssa_name (vec_dest, new_stmt);
8667           gimple_set_lhs (new_stmt, new_temp);
8668           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8669         }
8670
8671       if (slp_node)
8672         slp_node->push_vec_def (new_stmt);
8673       else if (single_defuse_cycle
8674                && i < ncopies - 1)
8675         {
8676           if (reduc_index == 0)
8677             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8678           else if (reduc_index == 1)
8679             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8680           else if (reduc_index == 2)
8681             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8682         }
8683       else
8684         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8685     }
8686
8687   if (!slp_node)
8688     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8689
8690   return true;
8691 }
8692
8693 /* Transform phase of a cycle PHI.  */
8694
8695 bool
8696 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8697                           stmt_vec_info stmt_info, gimple **vec_stmt,
8698                           slp_tree slp_node, slp_instance slp_node_instance)
8699 {
8700   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8701   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8702   int i;
8703   int ncopies;
8704   int j;
8705   bool nested_cycle = false;
8706   int vec_num;
8707
8708   if (nested_in_vect_loop_p (loop, stmt_info))
8709     {
8710       loop = loop->inner;
8711       nested_cycle = true;
8712     }
8713
8714   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8715   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8716   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8717   gcc_assert (reduc_info->is_reduc_info);
8718
8719   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8720       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8721     /* Leave the scalar phi in place.  */
8722     return true;
8723
8724   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8725   /* For a nested cycle we do not fill the above.  */
8726   if (!vectype_in)
8727     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8728   gcc_assert (vectype_in);
8729
8730   if (slp_node)
8731     {
8732       /* The size vect_schedule_slp_instance computes is off for us.  */
8733       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8734                                       * SLP_TREE_LANES (slp_node), vectype_in);
8735       ncopies = 1;
8736     }
8737   else
8738     {
8739       vec_num = 1;
8740       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8741     }
8742
8743   /* Check whether we should use a single PHI node and accumulate
8744      vectors to one before the backedge.  */
8745   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8746     ncopies = 1;
8747
8748   /* Create the destination vector  */
8749   gphi *phi = as_a <gphi *> (stmt_info->stmt);
8750   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8751                                                vectype_out);
8752
8753   /* Get the loop-entry arguments.  */
8754   tree vec_initial_def = NULL_TREE;
8755   auto_vec<tree> vec_initial_defs;
8756   if (slp_node)
8757     {
8758       vec_initial_defs.reserve (vec_num);
8759       if (nested_cycle)
8760         {
8761           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8762           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8763                              &vec_initial_defs);
8764         }
8765       else
8766         {
8767           gcc_assert (slp_node == slp_node_instance->reduc_phis);
8768           vec<tree> &initial_values = reduc_info->reduc_initial_values;
8769           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8770
8771           unsigned int num_phis = stmts.length ();
8772           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8773             num_phis = 1;
8774           initial_values.reserve (num_phis);
8775           for (unsigned int i = 0; i < num_phis; ++i)
8776             {
8777               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8778               initial_values.quick_push (vect_phi_initial_value (this_phi));
8779             }
8780           if (vec_num == 1)
8781             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8782           if (!initial_values.is_empty ())
8783             {
8784               tree initial_value
8785                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8786               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8787               tree neutral_op
8788                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8789                                             code, initial_value);
8790               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8791                                               &vec_initial_defs, vec_num,
8792                                               stmts.length (), neutral_op);
8793             }
8794         }
8795     }
8796   else
8797     {
8798       /* Get at the scalar def before the loop, that defines the initial
8799          value of the reduction variable.  */
8800       tree initial_def = vect_phi_initial_value (phi);
8801       reduc_info->reduc_initial_values.safe_push (initial_def);
8802       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8803          and we can't use zero for induc_val, use initial_def.  Similarly
8804          for REDUC_MIN and initial_def larger than the base.  */
8805       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8806         {
8807           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8808           if (TREE_CODE (initial_def) == INTEGER_CST
8809               && !integer_zerop (induc_val)
8810               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8811                    && tree_int_cst_lt (initial_def, induc_val))
8812                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8813                       && tree_int_cst_lt (induc_val, initial_def))))
8814             {
8815               induc_val = initial_def;
8816               /* Communicate we used the initial_def to epilouge
8817                  generation.  */
8818               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8819             }
8820           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8821         }
8822       else if (nested_cycle)
8823         {
8824           /* Do not use an adjustment def as that case is not supported
8825              correctly if ncopies is not one.  */
8826           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8827                                          ncopies, initial_def,
8828                                          &vec_initial_defs);
8829         }
8830       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8831                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8832         /* Fill the initial vector with the initial scalar value.  */
8833         vec_initial_def
8834           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8835                                            initial_def, initial_def);
8836       else
8837         {
8838           if (ncopies == 1)
8839             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8840           if (!reduc_info->reduc_initial_values.is_empty ())
8841             {
8842               initial_def = reduc_info->reduc_initial_values[0];
8843               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8844               tree neutral_op
8845                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8846                                             code, initial_def);
8847               gcc_assert (neutral_op);
8848               /* Try to simplify the vector initialization by applying an
8849                  adjustment after the reduction has been performed.  */
8850               if (!reduc_info->reused_accumulator
8851                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8852                   && !operand_equal_p (neutral_op, initial_def))
8853                 {
8854                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8855                     = initial_def;
8856                   initial_def = neutral_op;
8857                 }
8858               vec_initial_def
8859                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8860                                                  initial_def, neutral_op);
8861             }
8862         }
8863     }
8864
8865   if (vec_initial_def)
8866     {
8867       vec_initial_defs.create (ncopies);
8868       for (i = 0; i < ncopies; ++i)
8869         vec_initial_defs.quick_push (vec_initial_def);
8870     }
8871
8872   if (auto *accumulator = reduc_info->reused_accumulator)
8873     {
8874       tree def = accumulator->reduc_input;
8875       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8876         {
8877           unsigned int nreduc;
8878           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8879                                             (TREE_TYPE (def)),
8880                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8881                                           &nreduc);
8882           gcc_assert (res);
8883           gimple_seq stmts = NULL;
8884           /* Reduce the single vector to a smaller one.  */
8885           if (nreduc != 1)
8886             {
8887               /* Perform the reduction in the appropriate type.  */
8888               tree rvectype = vectype_out;
8889               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8890                                               TREE_TYPE (TREE_TYPE (def))))
8891                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8892                                               TYPE_VECTOR_SUBPARTS
8893                                                 (vectype_out));
8894               def = vect_create_partial_epilog (def, rvectype,
8895                                                 STMT_VINFO_REDUC_CODE
8896                                                   (reduc_info),
8897                                                 &stmts);
8898             }
8899           /* The epilogue loop might use a different vector mode, like
8900              VNx2DI vs. V2DI.  */
8901           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8902             {
8903               tree reduc_type = build_vector_type_for_mode
8904                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8905               def = gimple_convert (&stmts, reduc_type, def);
8906             }
8907           /* Adjust the input so we pick up the partially reduced value
8908              for the skip edge in vect_create_epilog_for_reduction.  */
8909           accumulator->reduc_input = def;
8910           /* And the reduction could be carried out using a different sign.  */
8911           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8912             def = gimple_convert (&stmts, vectype_out, def);
8913           if (loop_vinfo->main_loop_edge)
8914             {
8915               /* While we'd like to insert on the edge this will split
8916                  blocks and disturb bookkeeping, we also will eventually
8917                  need this on the skip edge.  Rely on sinking to
8918                  fixup optimal placement and insert in the pred.  */
8919               gimple_stmt_iterator gsi
8920                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8921               /* Insert before a cond that eventually skips the
8922                  epilogue.  */
8923               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8924                 gsi_prev (&gsi);
8925               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8926             }
8927           else
8928             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8929                                               stmts);
8930         }
8931       if (loop_vinfo->main_loop_edge)
8932         vec_initial_defs[0]
8933           = vect_get_main_loop_result (loop_vinfo, def,
8934                                        vec_initial_defs[0]);
8935       else
8936         vec_initial_defs.safe_push (def);
8937     }
8938
8939   /* Generate the reduction PHIs upfront.  */
8940   for (i = 0; i < vec_num; i++)
8941     {
8942       tree vec_init_def = vec_initial_defs[i];
8943       for (j = 0; j < ncopies; j++)
8944         {
8945           /* Create the reduction-phi that defines the reduction
8946              operand.  */
8947           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8948
8949           /* Set the loop-entry arg of the reduction-phi.  */
8950           if (j != 0 && nested_cycle)
8951             vec_init_def = vec_initial_defs[j];
8952           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8953                        UNKNOWN_LOCATION);
8954
8955           /* The loop-latch arg is set in epilogue processing.  */
8956
8957           if (slp_node)
8958             slp_node->push_vec_def (new_phi);
8959           else
8960             {
8961               if (j == 0)
8962                 *vec_stmt = new_phi;
8963               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8964             }
8965         }
8966     }
8967
8968   return true;
8969 }
8970
8971 /* Vectorizes LC PHIs.  */
8972
8973 bool
8974 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8975                      stmt_vec_info stmt_info, gimple **vec_stmt,
8976                      slp_tree slp_node)
8977 {
8978   if (!loop_vinfo
8979       || !is_a <gphi *> (stmt_info->stmt)
8980       || gimple_phi_num_args (stmt_info->stmt) != 1)
8981     return false;
8982
8983   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8984       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8985     return false;
8986
8987   if (!vec_stmt) /* transformation not required.  */
8988     {
8989       /* Deal with copies from externs or constants that disguise as
8990          loop-closed PHI nodes (PR97886).  */
8991       if (slp_node
8992           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8993                                                 SLP_TREE_VECTYPE (slp_node)))
8994         {
8995           if (dump_enabled_p ())
8996             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8997                              "incompatible vector types for invariants\n");
8998           return false;
8999         }
9000       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9001       return true;
9002     }
9003
9004   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9005   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9006   basic_block bb = gimple_bb (stmt_info->stmt);
9007   edge e = single_pred_edge (bb);
9008   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9009   auto_vec<tree> vec_oprnds;
9010   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9011                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9012                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9013   for (unsigned i = 0; i < vec_oprnds.length (); i++)
9014     {
9015       /* Create the vectorized LC PHI node.  */
9016       gphi *new_phi = create_phi_node (vec_dest, bb);
9017       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9018       if (slp_node)
9019         slp_node->push_vec_def (new_phi);
9020       else
9021         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9022     }
9023   if (!slp_node)
9024     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9025
9026   return true;
9027 }
9028
9029 /* Vectorizes PHIs.  */
9030
9031 bool
9032 vectorizable_phi (vec_info *,
9033                   stmt_vec_info stmt_info, gimple **vec_stmt,
9034                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9035 {
9036   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9037     return false;
9038
9039   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9040     return false;
9041
9042   tree vectype = SLP_TREE_VECTYPE (slp_node);
9043
9044   if (!vec_stmt) /* transformation not required.  */
9045     {
9046       slp_tree child;
9047       unsigned i;
9048       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9049         if (!child)
9050           {
9051             if (dump_enabled_p ())
9052               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9053                                "PHI node with unvectorized backedge def\n");
9054             return false;
9055           }
9056         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9057           {
9058             if (dump_enabled_p ())
9059               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9060                                "incompatible vector types for invariants\n");
9061             return false;
9062           }
9063         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9064                  && !useless_type_conversion_p (vectype,
9065                                                 SLP_TREE_VECTYPE (child)))
9066           {
9067             /* With bools we can have mask and non-mask precision vectors
9068                or different non-mask precisions.  while pattern recog is
9069                supposed to guarantee consistency here bugs in it can cause
9070                mismatches (PR103489 and PR103800 for example).
9071                Deal with them here instead of ICEing later.  */
9072             if (dump_enabled_p ())
9073               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9074                                "incompatible vector type setup from "
9075                                "bool pattern detection\n");
9076             return false;
9077           }
9078
9079       /* For single-argument PHIs assume coalescing which means zero cost
9080          for the scalar and the vector PHIs.  This avoids artificially
9081          favoring the vector path (but may pessimize it in some cases).  */
9082       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9083         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9084                           vector_stmt, stmt_info, vectype, 0, vect_body);
9085       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9086       return true;
9087     }
9088
9089   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9090   basic_block bb = gimple_bb (stmt_info->stmt);
9091   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9092   auto_vec<gphi *> new_phis;
9093   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9094     {
9095       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9096
9097       /* Skip not yet vectorized defs.  */
9098       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9099           && SLP_TREE_VEC_DEFS (child).is_empty ())
9100         continue;
9101
9102       auto_vec<tree> vec_oprnds;
9103       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9104       if (!new_phis.exists ())
9105         {
9106           new_phis.create (vec_oprnds.length ());
9107           for (unsigned j = 0; j < vec_oprnds.length (); j++)
9108             {
9109               /* Create the vectorized LC PHI node.  */
9110               new_phis.quick_push (create_phi_node (vec_dest, bb));
9111               slp_node->push_vec_def (new_phis[j]);
9112             }
9113         }
9114       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9115       for (unsigned j = 0; j < vec_oprnds.length (); j++)
9116         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9117     }
9118   /* We should have at least one already vectorized child.  */
9119   gcc_assert (new_phis.exists ());
9120
9121   return true;
9122 }
9123
9124 /* Vectorizes first order recurrences.  An overview of the transformation
9125    is described below. Suppose we have the following loop.
9126
9127      int t = 0;
9128      for (int i = 0; i < n; ++i)
9129        {
9130          b[i] = a[i] - t;
9131          t = a[i];
9132        }
9133
9134    There is a first-order recurrence on 'a'. For this loop, the scalar IR
9135    looks (simplified) like:
9136
9137     scalar.preheader:
9138       init = 0;
9139
9140     scalar.body:
9141       i = PHI <0(scalar.preheader), i+1(scalar.body)>
9142       _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9143       _1 = a[i]
9144       b[i] = _1 - _2
9145       if (i < n) goto scalar.body
9146
9147    In this example, _2 is a recurrence because it's value depends on the
9148    previous iteration.  We vectorize this as (VF = 4)
9149
9150     vector.preheader:
9151       vect_init = vect_cst(..., ..., ..., 0)
9152
9153     vector.body
9154       i = PHI <0(vector.preheader), i+4(vector.body)>
9155       vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9156       vect_2 = a[i, i+1, i+2, i+3];
9157       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9158       b[i, i+1, i+2, i+3] = vect_2 - vect_3
9159       if (..) goto vector.body
9160
9161    In this function, vectorizable_recurr, we code generate both the
9162    vector PHI node and the permute since those together compute the
9163    vectorized value of the scalar PHI.  We do not yet have the
9164    backedge value to fill in there nor into the vec_perm.  Those
9165    are filled in maybe_set_vectorized_backedge_value and
9166    vect_schedule_scc.
9167
9168    TODO:  Since the scalar loop does not have a use of the recurrence
9169    outside of the loop the natural way to implement peeling via
9170    vectorizing the live value doesn't work.  For now peeling of loops
9171    with a recurrence is not implemented.  For SLP the supported cases
9172    are restricted to those requiring a single vector recurrence PHI.  */
9173
9174 bool
9175 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9176                      gimple **vec_stmt, slp_tree slp_node,
9177                      stmt_vector_for_cost *cost_vec)
9178 {
9179   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9180     return false;
9181
9182   gphi *phi = as_a<gphi *> (stmt_info->stmt);
9183
9184   /* So far we only support first-order recurrence auto-vectorization.  */
9185   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9186     return false;
9187
9188   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9189   unsigned ncopies;
9190   if (slp_node)
9191     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9192   else
9193     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9194   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9195   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9196   /* We need to be able to make progress with a single vector.  */
9197   if (maybe_gt (dist * 2, nunits))
9198     {
9199       if (dump_enabled_p ())
9200         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9201                          "first order recurrence exceeds half of "
9202                          "a vector\n");
9203       return false;
9204     }
9205
9206   /* First-order recurrence autovectorization needs to handle permutation
9207      with indices = [nunits-1, nunits, nunits+1, ...].  */
9208   vec_perm_builder sel (nunits, 1, 3);
9209   for (int i = 0; i < 3; ++i)
9210     sel.quick_push (nunits - dist + i);
9211   vec_perm_indices indices (sel, 2, nunits);
9212
9213   if (!vec_stmt) /* transformation not required.  */
9214     {
9215       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9216                                  indices))
9217         return false;
9218
9219       if (slp_node)
9220         {
9221           /* We eventually need to set a vector type on invariant
9222              arguments.  */
9223           unsigned j;
9224           slp_tree child;
9225           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9226             if (!vect_maybe_update_slp_op_vectype
9227                   (child, SLP_TREE_VECTYPE (slp_node)))
9228               {
9229                 if (dump_enabled_p ())
9230                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9231                                    "incompatible vector types for "
9232                                    "invariants\n");
9233                 return false;
9234               }
9235         }
9236       /* The recurrence costs the initialization vector and one permute
9237          for each copy.  */
9238       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9239                                                  stmt_info, 0, vect_prologue);
9240       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9241                                                stmt_info, 0, vect_body);
9242       if (dump_enabled_p ())
9243         dump_printf_loc (MSG_NOTE, vect_location,
9244                          "vectorizable_recurr: inside_cost = %d, "
9245                          "prologue_cost = %d .\n", inside_cost,
9246                          prologue_cost);
9247
9248       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9249       return true;
9250     }
9251
9252   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9253   basic_block bb = gimple_bb (phi);
9254   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9255   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9256     {
9257       gimple_seq stmts = NULL;
9258       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9259       gsi_insert_seq_on_edge_immediate (pe, stmts);
9260     }
9261   tree vec_init = build_vector_from_val (vectype, preheader);
9262   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9263
9264   /* Create the vectorized first-order PHI node.  */
9265   tree vec_dest = vect_get_new_vect_var (vectype,
9266                                          vect_simple_var, "vec_recur_");
9267   gphi *new_phi = create_phi_node (vec_dest, bb);
9268   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9269
9270   /* Insert shuffles the first-order recurrence autovectorization.
9271        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
9272   tree perm = vect_gen_perm_mask_checked (vectype, indices);
9273
9274   /* Insert the required permute after the latch definition.  The
9275      second and later operands are tentative and will be updated when we have
9276      vectorized the latch definition.  */
9277   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9278   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9279   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9280   gsi_next (&gsi2);
9281
9282   for (unsigned i = 0; i < ncopies; ++i)
9283     {
9284       vec_dest = make_ssa_name (vectype);
9285       gassign *vperm
9286           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9287                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
9288                                  NULL, perm);
9289       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9290
9291       if (slp_node)
9292         slp_node->push_vec_def (vperm);
9293       else
9294         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9295     }
9296
9297   if (!slp_node)
9298     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9299   return true;
9300 }
9301
9302 /* Return true if VECTYPE represents a vector that requires lowering
9303    by the vector lowering pass.  */
9304
9305 bool
9306 vect_emulated_vector_p (tree vectype)
9307 {
9308   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9309           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9310               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9311 }
9312
9313 /* Return true if we can emulate CODE on an integer mode representation
9314    of a vector.  */
9315
9316 bool
9317 vect_can_vectorize_without_simd_p (tree_code code)
9318 {
9319   switch (code)
9320     {
9321     case PLUS_EXPR:
9322     case MINUS_EXPR:
9323     case NEGATE_EXPR:
9324     case BIT_AND_EXPR:
9325     case BIT_IOR_EXPR:
9326     case BIT_XOR_EXPR:
9327     case BIT_NOT_EXPR:
9328       return true;
9329
9330     default:
9331       return false;
9332     }
9333 }
9334
9335 /* Likewise, but taking a code_helper.  */
9336
9337 bool
9338 vect_can_vectorize_without_simd_p (code_helper code)
9339 {
9340   return (code.is_tree_code ()
9341           && vect_can_vectorize_without_simd_p (tree_code (code)));
9342 }
9343
9344 /* Create vector init for vectorized iv.  */
9345 static tree
9346 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9347                                tree step_expr, poly_uint64 nunits,
9348                                tree vectype,
9349                                enum vect_induction_op_type induction_type)
9350 {
9351   unsigned HOST_WIDE_INT const_nunits;
9352   tree vec_shift, vec_init, new_name;
9353   unsigned i;
9354   tree itype = TREE_TYPE (vectype);
9355
9356   /* iv_loop is the loop to be vectorized. Create:
9357      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
9358   new_name = gimple_convert (stmts, itype, init_expr);
9359   switch (induction_type)
9360     {
9361     case vect_step_op_shr:
9362     case vect_step_op_shl:
9363       /* Build the Initial value from shift_expr.  */
9364       vec_init = gimple_build_vector_from_val (stmts,
9365                                                vectype,
9366                                                new_name);
9367       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9368                                 build_zero_cst (itype), step_expr);
9369       vec_init = gimple_build (stmts,
9370                                (induction_type == vect_step_op_shr
9371                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9372                                vectype, vec_init, vec_shift);
9373       break;
9374
9375     case vect_step_op_neg:
9376       {
9377         vec_init = gimple_build_vector_from_val (stmts,
9378                                                  vectype,
9379                                                  new_name);
9380         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9381                                      vectype, vec_init);
9382         /* The encoding has 2 interleaved stepped patterns.  */
9383         vec_perm_builder sel (nunits, 2, 3);
9384         sel.quick_grow (6);
9385         for (i = 0; i < 3; i++)
9386           {
9387             sel[2 * i] = i;
9388             sel[2 * i + 1] = i + nunits;
9389           }
9390         vec_perm_indices indices (sel, 2, nunits);
9391         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9392            fail when vec_init is const vector. In that situation vec_perm is not
9393            really needed.  */
9394         tree perm_mask_even
9395           = vect_gen_perm_mask_any (vectype, indices);
9396         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9397                                  vectype,
9398                                  vec_init, vec_neg,
9399                                  perm_mask_even);
9400       }
9401       break;
9402
9403     case vect_step_op_mul:
9404       {
9405         /* Use unsigned mult to avoid UD integer overflow.  */
9406         gcc_assert (nunits.is_constant (&const_nunits));
9407         tree utype = unsigned_type_for (itype);
9408         tree uvectype = build_vector_type (utype,
9409                                            TYPE_VECTOR_SUBPARTS (vectype));
9410         new_name = gimple_convert (stmts, utype, new_name);
9411         vec_init = gimple_build_vector_from_val (stmts,
9412                                                  uvectype,
9413                                                  new_name);
9414         tree_vector_builder elts (uvectype, const_nunits, 1);
9415         tree elt_step = build_one_cst (utype);
9416
9417         elts.quick_push (elt_step);
9418         for (i = 1; i < const_nunits; i++)
9419           {
9420             /* Create: new_name_i = new_name + step_expr.  */
9421             elt_step = gimple_build (stmts, MULT_EXPR,
9422                                      utype, elt_step, step_expr);
9423             elts.quick_push (elt_step);
9424           }
9425         /* Create a vector from [new_name_0, new_name_1, ...,
9426            new_name_nunits-1].  */
9427         tree vec_mul = gimple_build_vector (stmts, &elts);
9428         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9429                                  vec_init, vec_mul);
9430         vec_init = gimple_convert (stmts, vectype, vec_init);
9431       }
9432       break;
9433
9434     default:
9435       gcc_unreachable ();
9436     }
9437
9438   return vec_init;
9439 }
9440
9441 /* Peel init_expr by skip_niter for induction_type.  */
9442 tree
9443 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9444                              tree skip_niters, tree step_expr,
9445                              enum vect_induction_op_type induction_type)
9446 {
9447   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9448   tree type = TREE_TYPE (init_expr);
9449   unsigned prec = TYPE_PRECISION (type);
9450   switch (induction_type)
9451     {
9452     case vect_step_op_neg:
9453       if (TREE_INT_CST_LOW (skip_niters) % 2)
9454         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9455       /* else no change.  */
9456       break;
9457
9458     case vect_step_op_shr:
9459     case vect_step_op_shl:
9460       skip_niters = gimple_convert (stmts, type, skip_niters);
9461       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9462       /* When shift mount >= precision, need to avoid UD.
9463          In the original loop, there's no UD, and according to semantic,
9464          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9465       if (!tree_fits_uhwi_p (step_expr)
9466           || tree_to_uhwi (step_expr) >= prec)
9467         {
9468           if (induction_type == vect_step_op_shl
9469               || TYPE_UNSIGNED (type))
9470             init_expr = build_zero_cst (type);
9471           else
9472             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9473                                       init_expr,
9474                                       wide_int_to_tree (type, prec - 1));
9475         }
9476       else
9477         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9478                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9479                                   type, init_expr, step_expr);
9480       break;
9481
9482     case vect_step_op_mul:
9483       {
9484         tree utype = unsigned_type_for (type);
9485         init_expr = gimple_convert (stmts, utype, init_expr);
9486         wide_int skipn = wi::to_wide (skip_niters);
9487         wide_int begin = wi::to_wide (step_expr);
9488         auto_mpz base, exp, mod, res;
9489         wi::to_mpz (begin, base, TYPE_SIGN (type));
9490         wi::to_mpz (skipn, exp, UNSIGNED);
9491         mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9492         mpz_powm (res, base, exp, mod);
9493         begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9494         tree mult_expr = wide_int_to_tree (utype, begin);
9495         init_expr = gimple_build (stmts, MULT_EXPR, utype,
9496                                   init_expr, mult_expr);
9497         init_expr = gimple_convert (stmts, type, init_expr);
9498       }
9499       break;
9500
9501     default:
9502       gcc_unreachable ();
9503     }
9504
9505   return init_expr;
9506 }
9507
9508 /* Create vector step for vectorized iv.  */
9509 static tree
9510 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9511                                poly_uint64 vf,
9512                                enum vect_induction_op_type induction_type)
9513 {
9514   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9515   tree new_name = NULL;
9516   /* Step should be pow (step, vf) for mult induction.  */
9517   if (induction_type == vect_step_op_mul)
9518     {
9519       gcc_assert (vf.is_constant ());
9520       wide_int begin = wi::to_wide (step_expr);
9521
9522       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9523         begin = wi::mul (begin, wi::to_wide (step_expr));
9524
9525       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9526     }
9527   else if (induction_type == vect_step_op_neg)
9528     /* Do nothing.  */
9529     ;
9530   else
9531     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9532                              expr, step_expr);
9533   return new_name;
9534 }
9535
9536 static tree
9537 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9538                                    stmt_vec_info stmt_info,
9539                                    tree new_name, tree vectype,
9540                                    enum vect_induction_op_type induction_type)
9541 {
9542   /* No step is needed for neg induction.  */
9543   if (induction_type == vect_step_op_neg)
9544     return NULL;
9545
9546   tree t = unshare_expr (new_name);
9547   gcc_assert (CONSTANT_CLASS_P (new_name)
9548               || TREE_CODE (new_name) == SSA_NAME);
9549   tree new_vec = build_vector_from_val (vectype, t);
9550   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9551                                     new_vec, vectype, NULL);
9552   return vec_step;
9553 }
9554
9555 /* Update vectorized iv with vect_step, induc_def is init.  */
9556 static tree
9557 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9558                           tree induc_def, tree vec_step,
9559                           enum vect_induction_op_type induction_type)
9560 {
9561   tree vec_def = induc_def;
9562   switch (induction_type)
9563     {
9564     case vect_step_op_mul:
9565       {
9566         /* Use unsigned mult to avoid UD integer overflow.  */
9567         tree uvectype
9568           = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9569                                TYPE_VECTOR_SUBPARTS (vectype));
9570         vec_def = gimple_convert (stmts, uvectype, vec_def);
9571         vec_step = gimple_convert (stmts, uvectype, vec_step);
9572         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9573                                 vec_def, vec_step);
9574         vec_def = gimple_convert (stmts, vectype, vec_def);
9575       }
9576       break;
9577
9578     case vect_step_op_shr:
9579       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9580                               vec_def, vec_step);
9581       break;
9582
9583     case vect_step_op_shl:
9584       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9585                               vec_def, vec_step);
9586       break;
9587     case vect_step_op_neg:
9588       vec_def = induc_def;
9589       /* Do nothing.  */
9590       break;
9591     default:
9592       gcc_unreachable ();
9593     }
9594
9595   return vec_def;
9596
9597 }
9598
9599 /* Function vectorizable_induction
9600
9601    Check if STMT_INFO performs an nonlinear induction computation that can be
9602    vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9603    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9604    basic block.
9605    Return true if STMT_INFO is vectorizable in this way.  */
9606
9607 static bool
9608 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9609                                   stmt_vec_info stmt_info,
9610                                   gimple **vec_stmt, slp_tree slp_node,
9611                                   stmt_vector_for_cost *cost_vec)
9612 {
9613   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9614   unsigned ncopies;
9615   bool nested_in_vect_loop = false;
9616   class loop *iv_loop;
9617   tree vec_def;
9618   edge pe = loop_preheader_edge (loop);
9619   basic_block new_bb;
9620   tree vec_init, vec_step;
9621   tree new_name;
9622   gimple *new_stmt;
9623   gphi *induction_phi;
9624   tree induc_def, vec_dest;
9625   tree init_expr, step_expr;
9626   tree niters_skip;
9627   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9628   unsigned i;
9629   gimple_stmt_iterator si;
9630
9631   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9632
9633   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9634   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9635   enum vect_induction_op_type induction_type
9636     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9637
9638   gcc_assert (induction_type > vect_step_op_add);
9639
9640   if (slp_node)
9641     ncopies = 1;
9642   else
9643     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9644   gcc_assert (ncopies >= 1);
9645
9646   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9647   if (nested_in_vect_loop_p (loop, stmt_info))
9648     {
9649       if (dump_enabled_p ())
9650         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9651                          "nonlinear induction in nested loop.\n");
9652       return false;
9653     }
9654
9655   iv_loop = loop;
9656   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9657
9658   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9659      update for each iv and a permutation to generate wanted vector iv.  */
9660   if (slp_node)
9661     {
9662       if (dump_enabled_p ())
9663         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9664                          "SLP induction not supported for nonlinear"
9665                          " induction.\n");
9666       return false;
9667     }
9668
9669   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9670     {
9671       if (dump_enabled_p ())
9672         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9673                          "floating point nonlinear induction vectorization"
9674                          " not supported.\n");
9675       return false;
9676     }
9677
9678   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9679   init_expr = vect_phi_initial_value (phi);
9680   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9681               && TREE_CODE (step_expr) == INTEGER_CST);
9682   /* step_expr should be aligned with init_expr,
9683      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
9684   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9685
9686   if (TREE_CODE (init_expr) == INTEGER_CST)
9687     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9688   else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9689     {
9690       /* INIT_EXPR could be a bit_field, bail out for such case.  */
9691       if (dump_enabled_p ())
9692         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9693                          "nonlinear induction vectorization failed:"
9694                          " component type of vectype is not a nop conversion"
9695                          " from type of init_expr.\n");
9696       return false;
9697     }
9698
9699   switch (induction_type)
9700     {
9701     case vect_step_op_neg:
9702       if (TREE_CODE (init_expr) != INTEGER_CST
9703           && TREE_CODE (init_expr) != REAL_CST)
9704         {
9705           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9706           if (!directly_supported_p (NEGATE_EXPR, vectype))
9707             return false;
9708
9709           /* The encoding has 2 interleaved stepped patterns.  */
9710           vec_perm_builder sel (nunits, 2, 3);
9711           machine_mode mode = TYPE_MODE (vectype);
9712           sel.quick_grow (6);
9713           for (i = 0; i < 3; i++)
9714             {
9715               sel[i * 2] = i;
9716               sel[i * 2 + 1] = i + nunits;
9717             }
9718           vec_perm_indices indices (sel, 2, nunits);
9719           if (!can_vec_perm_const_p (mode, mode, indices))
9720             return false;
9721         }
9722       break;
9723
9724     case vect_step_op_mul:
9725       {
9726         /* Check for backend support of MULT_EXPR.  */
9727         if (!directly_supported_p (MULT_EXPR, vectype))
9728           return false;
9729
9730         /* ?? How to construct vector step for variable number vector.
9731            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9732         if (!vf.is_constant ())
9733           return false;
9734       }
9735       break;
9736
9737     case vect_step_op_shr:
9738       /* Check for backend support of RSHIFT_EXPR.  */
9739       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9740         return false;
9741
9742       /* Don't shift more than type precision to avoid UD.  */
9743       if (!tree_fits_uhwi_p (step_expr)
9744           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9745                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9746         return false;
9747       break;
9748
9749     case vect_step_op_shl:
9750       /* Check for backend support of RSHIFT_EXPR.  */
9751       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9752         return false;
9753
9754       /* Don't shift more than type precision to avoid UD.  */
9755       if (!tree_fits_uhwi_p (step_expr)
9756           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9757                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9758         return false;
9759
9760       break;
9761
9762     default:
9763       gcc_unreachable ();
9764     }
9765
9766   if (!vec_stmt) /* transformation not required.  */
9767     {
9768       unsigned inside_cost = 0, prologue_cost = 0;
9769       /* loop cost for vec_loop. Neg induction doesn't have any
9770          inside_cost.  */
9771       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9772                                       stmt_info, 0, vect_body);
9773
9774       /* loop cost for vec_loop. Neg induction doesn't have any
9775          inside_cost.  */
9776       if (induction_type == vect_step_op_neg)
9777         inside_cost = 0;
9778
9779       /* prologue cost for vec_init and vec_step.  */
9780       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9781                                         stmt_info, 0, vect_prologue);
9782
9783       if (dump_enabled_p ())
9784         dump_printf_loc (MSG_NOTE, vect_location,
9785                          "vect_model_induction_cost: inside_cost = %d, "
9786                          "prologue_cost = %d. \n", inside_cost,
9787                          prologue_cost);
9788
9789       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9790       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9791       return true;
9792     }
9793
9794   /* Transform.  */
9795
9796   /* Compute a vector variable, initialized with the first VF values of
9797      the induction variable.  E.g., for an iv with IV_PHI='X' and
9798      evolution S, for a vector of 4 units, we want to compute:
9799      [X, X + S, X + 2*S, X + 3*S].  */
9800
9801   if (dump_enabled_p ())
9802     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9803
9804   pe = loop_preheader_edge (iv_loop);
9805   /* Find the first insertion point in the BB.  */
9806   basic_block bb = gimple_bb (phi);
9807   si = gsi_after_labels (bb);
9808
9809   gimple_seq stmts = NULL;
9810
9811   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9812   /* If we are using the loop mask to "peel" for alignment then we need
9813      to adjust the start value here.  */
9814   if (niters_skip != NULL_TREE)
9815     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9816                                              step_expr, induction_type);
9817
9818   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9819                                             step_expr, nunits, vectype,
9820                                             induction_type);
9821   if (stmts)
9822     {
9823       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9824       gcc_assert (!new_bb);
9825     }
9826
9827   stmts = NULL;
9828   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9829                                             vf, induction_type);
9830   if (stmts)
9831     {
9832       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9833       gcc_assert (!new_bb);
9834     }
9835
9836   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9837                                                 new_name, vectype,
9838                                                 induction_type);
9839   /* Create the following def-use cycle:
9840      loop prolog:
9841      vec_init = ...
9842      vec_step = ...
9843      loop:
9844      vec_iv = PHI <vec_init, vec_loop>
9845      ...
9846      STMT
9847      ...
9848      vec_loop = vec_iv + vec_step;  */
9849
9850   /* Create the induction-phi that defines the induction-operand.  */
9851   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9852   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9853   induc_def = PHI_RESULT (induction_phi);
9854
9855   /* Create the iv update inside the loop.  */
9856   stmts = NULL;
9857   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9858                                       induc_def, vec_step,
9859                                       induction_type);
9860
9861   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9862   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9863
9864   /* Set the arguments of the phi node:  */
9865   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9866   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9867                UNKNOWN_LOCATION);
9868
9869   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9870   *vec_stmt = induction_phi;
9871
9872   /* In case that vectorization factor (VF) is bigger than the number
9873      of elements that we can fit in a vectype (nunits), we have to generate
9874      more than one vector stmt - i.e - we need to "unroll" the
9875      vector stmt by a factor VF/nunits.  For more details see documentation
9876      in vectorizable_operation.  */
9877
9878   if (ncopies > 1)
9879     {
9880       stmts = NULL;
9881       /* FORNOW. This restriction should be relaxed.  */
9882       gcc_assert (!nested_in_vect_loop);
9883
9884       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9885                                                 nunits, induction_type);
9886
9887       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9888                                                     new_name, vectype,
9889                                                     induction_type);
9890       vec_def = induc_def;
9891       for (i = 1; i < ncopies; i++)
9892         {
9893           /* vec_i = vec_prev + vec_step.  */
9894           stmts = NULL;
9895           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9896                                               vec_def, vec_step,
9897                                               induction_type);
9898           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9899           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9900           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9901         }
9902     }
9903
9904   if (dump_enabled_p ())
9905     dump_printf_loc (MSG_NOTE, vect_location,
9906                      "transform induction: created def-use cycle: %G%G",
9907                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9908
9909   return true;
9910 }
9911
9912 /* Function vectorizable_induction
9913
9914    Check if STMT_INFO performs an induction computation that can be vectorized.
9915    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9916    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9917    Return true if STMT_INFO is vectorizable in this way.  */
9918
9919 bool
9920 vectorizable_induction (loop_vec_info loop_vinfo,
9921                         stmt_vec_info stmt_info,
9922                         gimple **vec_stmt, slp_tree slp_node,
9923                         stmt_vector_for_cost *cost_vec)
9924 {
9925   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9926   unsigned ncopies;
9927   bool nested_in_vect_loop = false;
9928   class loop *iv_loop;
9929   tree vec_def;
9930   edge pe = loop_preheader_edge (loop);
9931   basic_block new_bb;
9932   tree new_vec, vec_init, vec_step, t;
9933   tree new_name;
9934   gimple *new_stmt;
9935   gphi *induction_phi;
9936   tree induc_def, vec_dest;
9937   tree init_expr, step_expr;
9938   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9939   unsigned i;
9940   tree expr;
9941   gimple_stmt_iterator si;
9942   enum vect_induction_op_type induction_type
9943     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9944
9945   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9946   if (!phi)
9947     return false;
9948
9949   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9950     return false;
9951
9952   /* Make sure it was recognized as induction computation.  */
9953   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9954     return false;
9955
9956   /* Handle nonlinear induction in a separate place.  */
9957   if (induction_type != vect_step_op_add)
9958     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9959                                              vec_stmt, slp_node, cost_vec);
9960
9961   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9962   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9963
9964   if (slp_node)
9965     ncopies = 1;
9966   else
9967     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9968   gcc_assert (ncopies >= 1);
9969
9970   /* FORNOW. These restrictions should be relaxed.  */
9971   if (nested_in_vect_loop_p (loop, stmt_info))
9972     {
9973       imm_use_iterator imm_iter;
9974       use_operand_p use_p;
9975       gimple *exit_phi;
9976       edge latch_e;
9977       tree loop_arg;
9978
9979       if (ncopies > 1)
9980         {
9981           if (dump_enabled_p ())
9982             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9983                              "multiple types in nested loop.\n");
9984           return false;
9985         }
9986
9987       exit_phi = NULL;
9988       latch_e = loop_latch_edge (loop->inner);
9989       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9990       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9991         {
9992           gimple *use_stmt = USE_STMT (use_p);
9993           if (is_gimple_debug (use_stmt))
9994             continue;
9995
9996           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9997             {
9998               exit_phi = use_stmt;
9999               break;
10000             }
10001         }
10002       if (exit_phi)
10003         {
10004           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10005           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10006                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10007             {
10008               if (dump_enabled_p ())
10009                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10010                                  "inner-loop induction only used outside "
10011                                  "of the outer vectorized loop.\n");
10012               return false;
10013             }
10014         }
10015
10016       nested_in_vect_loop = true;
10017       iv_loop = loop->inner;
10018     }
10019   else
10020     iv_loop = loop;
10021   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10022
10023   if (slp_node && !nunits.is_constant ())
10024     {
10025       /* The current SLP code creates the step value element-by-element.  */
10026       if (dump_enabled_p ())
10027         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10028                          "SLP induction not supported for variable-length"
10029                          " vectors.\n");
10030       return false;
10031     }
10032
10033   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10034     {
10035       if (dump_enabled_p ())
10036         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10037                          "floating point induction vectorization disabled\n");
10038       return false;
10039     }
10040
10041   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10042   gcc_assert (step_expr != NULL_TREE);
10043   if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10044       && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10045     {
10046       if (dump_enabled_p ())
10047         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10048                          "bit-precision induction vectorization not "
10049                          "supported.\n");
10050       return false;
10051     }
10052   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10053
10054   /* Check for backend support of PLUS/MINUS_EXPR. */
10055   if (!directly_supported_p (PLUS_EXPR, step_vectype)
10056       || !directly_supported_p (MINUS_EXPR, step_vectype))
10057     return false;
10058
10059   if (!vec_stmt) /* transformation not required.  */
10060     {
10061       unsigned inside_cost = 0, prologue_cost = 0;
10062       if (slp_node)
10063         {
10064           /* We eventually need to set a vector type on invariant
10065              arguments.  */
10066           unsigned j;
10067           slp_tree child;
10068           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10069             if (!vect_maybe_update_slp_op_vectype
10070                 (child, SLP_TREE_VECTYPE (slp_node)))
10071               {
10072                 if (dump_enabled_p ())
10073                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10074                                    "incompatible vector types for "
10075                                    "invariants\n");
10076                 return false;
10077               }
10078           /* loop cost for vec_loop.  */
10079           inside_cost
10080             = record_stmt_cost (cost_vec,
10081                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10082                                 vector_stmt, stmt_info, 0, vect_body);
10083           /* prologue cost for vec_init (if not nested) and step.  */
10084           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10085                                             scalar_to_vec,
10086                                             stmt_info, 0, vect_prologue);
10087         }
10088       else /* if (!slp_node) */
10089         {
10090           /* loop cost for vec_loop.  */
10091           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10092                                           stmt_info, 0, vect_body);
10093           /* prologue cost for vec_init and vec_step.  */
10094           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10095                                             stmt_info, 0, vect_prologue);
10096         }
10097       if (dump_enabled_p ())
10098         dump_printf_loc (MSG_NOTE, vect_location,
10099                          "vect_model_induction_cost: inside_cost = %d, "
10100                          "prologue_cost = %d .\n", inside_cost,
10101                          prologue_cost);
10102
10103       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10104       DUMP_VECT_SCOPE ("vectorizable_induction");
10105       return true;
10106     }
10107
10108   /* Transform.  */
10109
10110   /* Compute a vector variable, initialized with the first VF values of
10111      the induction variable.  E.g., for an iv with IV_PHI='X' and
10112      evolution S, for a vector of 4 units, we want to compute:
10113      [X, X + S, X + 2*S, X + 3*S].  */
10114
10115   if (dump_enabled_p ())
10116     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10117
10118   pe = loop_preheader_edge (iv_loop);
10119   /* Find the first insertion point in the BB.  */
10120   basic_block bb = gimple_bb (phi);
10121   si = gsi_after_labels (bb);
10122
10123   /* For SLP induction we have to generate several IVs as for example
10124      with group size 3 we need
10125        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10126        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
10127   if (slp_node)
10128     {
10129       /* Enforced above.  */
10130       unsigned int const_nunits = nunits.to_constant ();
10131
10132       /* The initial values are vectorized, but any lanes > group_size
10133          need adjustment.  */
10134       slp_tree init_node
10135         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10136
10137       /* Gather steps.  Since we do not vectorize inductions as
10138          cycles we have to reconstruct the step from SCEV data.  */
10139       unsigned group_size = SLP_TREE_LANES (slp_node);
10140       tree *steps = XALLOCAVEC (tree, group_size);
10141       tree *inits = XALLOCAVEC (tree, group_size);
10142       stmt_vec_info phi_info;
10143       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10144         {
10145           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10146           if (!init_node)
10147             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10148                                            pe->dest_idx);
10149         }
10150
10151       /* Now generate the IVs.  */
10152       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10153       gcc_assert ((const_nunits * nvects) % group_size == 0);
10154       unsigned nivs;
10155       if (nested_in_vect_loop)
10156         nivs = nvects;
10157       else
10158         {
10159           /* Compute the number of distinct IVs we need.  First reduce
10160              group_size if it is a multiple of const_nunits so we get
10161              one IV for a group_size of 4 but const_nunits 2.  */
10162           unsigned group_sizep = group_size;
10163           if (group_sizep % const_nunits == 0)
10164             group_sizep = group_sizep / const_nunits;
10165           nivs = least_common_multiple (group_sizep,
10166                                         const_nunits) / const_nunits;
10167         }
10168       tree stept = TREE_TYPE (step_vectype);
10169       tree lupdate_mul = NULL_TREE;
10170       if (!nested_in_vect_loop)
10171         {
10172           /* The number of iterations covered in one vector iteration.  */
10173           unsigned lup_mul = (nvects * const_nunits) / group_size;
10174           lupdate_mul
10175             = build_vector_from_val (step_vectype,
10176                                      SCALAR_FLOAT_TYPE_P (stept)
10177                                      ? build_real_from_wide (stept, lup_mul,
10178                                                              UNSIGNED)
10179                                      : build_int_cstu (stept, lup_mul));
10180         }
10181       tree peel_mul = NULL_TREE;
10182       gimple_seq init_stmts = NULL;
10183       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10184         {
10185           if (SCALAR_FLOAT_TYPE_P (stept))
10186             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10187                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10188           else
10189             peel_mul = gimple_convert (&init_stmts, stept,
10190                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10191           peel_mul = gimple_build_vector_from_val (&init_stmts,
10192                                                    step_vectype, peel_mul);
10193         }
10194       unsigned ivn;
10195       auto_vec<tree> vec_steps;
10196       for (ivn = 0; ivn < nivs; ++ivn)
10197         {
10198           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10199           tree_vector_builder init_elts (vectype, const_nunits, 1);
10200           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10201           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10202             {
10203               /* The scalar steps of the IVs.  */
10204               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10205               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10206               step_elts.quick_push (elt);
10207               if (!init_node)
10208                 {
10209                   /* The scalar inits of the IVs if not vectorized.  */
10210                   elt = inits[(ivn*const_nunits + eltn) % group_size];
10211                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
10212                                                   TREE_TYPE (elt)))
10213                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10214                                         TREE_TYPE (vectype), elt);
10215                   init_elts.quick_push (elt);
10216                 }
10217               /* The number of steps to add to the initial values.  */
10218               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10219               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10220                                    ? build_real_from_wide (stept,
10221                                                            mul_elt, UNSIGNED)
10222                                    : build_int_cstu (stept, mul_elt));
10223             }
10224           vec_step = gimple_build_vector (&init_stmts, &step_elts);
10225           vec_steps.safe_push (vec_step);
10226           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10227           if (peel_mul)
10228             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10229                                      step_mul, peel_mul);
10230           if (!init_node)
10231             vec_init = gimple_build_vector (&init_stmts, &init_elts);
10232
10233           /* Create the induction-phi that defines the induction-operand.  */
10234           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10235                                             "vec_iv_");
10236           induction_phi = create_phi_node (vec_dest, iv_loop->header);
10237           induc_def = PHI_RESULT (induction_phi);
10238
10239           /* Create the iv update inside the loop  */
10240           tree up = vec_step;
10241           if (lupdate_mul)
10242             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10243                                vec_step, lupdate_mul);
10244           gimple_seq stmts = NULL;
10245           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10246           vec_def = gimple_build (&stmts,
10247                                   PLUS_EXPR, step_vectype, vec_def, up);
10248           vec_def = gimple_convert (&stmts, vectype, vec_def);
10249           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10250           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10251                        UNKNOWN_LOCATION);
10252
10253           if (init_node)
10254             vec_init = vect_get_slp_vect_def (init_node, ivn);
10255           if (!nested_in_vect_loop
10256               && !integer_zerop (step_mul))
10257             {
10258               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10259               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10260                                  vec_step, step_mul);
10261               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10262                                       vec_def, up);
10263               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10264             }
10265
10266           /* Set the arguments of the phi node:  */
10267           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10268
10269           slp_node->push_vec_def (induction_phi);
10270         }
10271       if (!nested_in_vect_loop)
10272         {
10273           /* Fill up to the number of vectors we need for the whole group.  */
10274           nivs = least_common_multiple (group_size,
10275                                         const_nunits) / const_nunits;
10276           vec_steps.reserve (nivs-ivn);
10277           for (; ivn < nivs; ++ivn)
10278             {
10279               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10280               vec_steps.quick_push (vec_steps[0]);
10281             }
10282         }
10283
10284       /* Re-use IVs when we can.  We are generating further vector
10285          stmts by adding VF' * stride to the IVs generated above.  */
10286       if (ivn < nvects)
10287         {
10288           unsigned vfp
10289             = least_common_multiple (group_size, const_nunits) / group_size;
10290           tree lupdate_mul
10291             = build_vector_from_val (step_vectype,
10292                                      SCALAR_FLOAT_TYPE_P (stept)
10293                                      ? build_real_from_wide (stept,
10294                                                              vfp, UNSIGNED)
10295                                      : build_int_cstu (stept, vfp));
10296           for (; ivn < nvects; ++ivn)
10297             {
10298               gimple *iv
10299                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10300               tree def = gimple_get_lhs (iv);
10301               if (ivn < 2*nivs)
10302                 vec_steps[ivn - nivs]
10303                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10304                                   vec_steps[ivn - nivs], lupdate_mul);
10305               gimple_seq stmts = NULL;
10306               def = gimple_convert (&stmts, step_vectype, def);
10307               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10308                                   def, vec_steps[ivn % nivs]);
10309               def = gimple_convert (&stmts, vectype, def);
10310               if (gimple_code (iv) == GIMPLE_PHI)
10311                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10312               else
10313                 {
10314                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10315                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10316                 }
10317               slp_node->push_vec_def (def);
10318             }
10319         }
10320
10321       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10322       gcc_assert (!new_bb);
10323
10324       return true;
10325     }
10326
10327   init_expr = vect_phi_initial_value (phi);
10328
10329   gimple_seq stmts = NULL;
10330   if (!nested_in_vect_loop)
10331     {
10332       /* Convert the initial value to the IV update type.  */
10333       tree new_type = TREE_TYPE (step_expr);
10334       init_expr = gimple_convert (&stmts, new_type, init_expr);
10335
10336       /* If we are using the loop mask to "peel" for alignment then we need
10337          to adjust the start value here.  */
10338       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10339       if (skip_niters != NULL_TREE)
10340         {
10341           if (FLOAT_TYPE_P (vectype))
10342             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10343                                         skip_niters);
10344           else
10345             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10346           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10347                                          skip_niters, step_expr);
10348           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10349                                     init_expr, skip_step);
10350         }
10351     }
10352
10353   if (stmts)
10354     {
10355       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10356       gcc_assert (!new_bb);
10357     }
10358
10359   /* Create the vector that holds the initial_value of the induction.  */
10360   if (nested_in_vect_loop)
10361     {
10362       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
10363          been created during vectorization of previous stmts.  We obtain it
10364          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
10365       auto_vec<tree> vec_inits;
10366       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10367                                      init_expr, &vec_inits);
10368       vec_init = vec_inits[0];
10369       /* If the initial value is not of proper type, convert it.  */
10370       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10371         {
10372           new_stmt
10373             = gimple_build_assign (vect_get_new_ssa_name (vectype,
10374                                                           vect_simple_var,
10375                                                           "vec_iv_"),
10376                                    VIEW_CONVERT_EXPR,
10377                                    build1 (VIEW_CONVERT_EXPR, vectype,
10378                                            vec_init));
10379           vec_init = gimple_assign_lhs (new_stmt);
10380           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10381                                                  new_stmt);
10382           gcc_assert (!new_bb);
10383         }
10384     }
10385   else
10386     {
10387       /* iv_loop is the loop to be vectorized. Create:
10388          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10389       stmts = NULL;
10390       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10391
10392       unsigned HOST_WIDE_INT const_nunits;
10393       if (nunits.is_constant (&const_nunits))
10394         {
10395           tree_vector_builder elts (step_vectype, const_nunits, 1);
10396           elts.quick_push (new_name);
10397           for (i = 1; i < const_nunits; i++)
10398             {
10399               /* Create: new_name_i = new_name + step_expr  */
10400               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10401                                        new_name, step_expr);
10402               elts.quick_push (new_name);
10403             }
10404           /* Create a vector from [new_name_0, new_name_1, ...,
10405              new_name_nunits-1]  */
10406           vec_init = gimple_build_vector (&stmts, &elts);
10407         }
10408       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10409         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10410         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10411                                  new_name, step_expr);
10412       else
10413         {
10414           /* Build:
10415                 [base, base, base, ...]
10416                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10417           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10418           gcc_assert (flag_associative_math);
10419           tree index = build_index_vector (step_vectype, 0, 1);
10420           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10421                                                         new_name);
10422           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10423                                                         step_expr);
10424           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10425           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10426                                    vec_init, step_vec);
10427           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10428                                    vec_init, base_vec);
10429         }
10430       vec_init = gimple_convert (&stmts, vectype, vec_init);
10431
10432       if (stmts)
10433         {
10434           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10435           gcc_assert (!new_bb);
10436         }
10437     }
10438
10439
10440   /* Create the vector that holds the step of the induction.  */
10441   gimple_stmt_iterator *step_iv_si = NULL;
10442   if (nested_in_vect_loop)
10443     /* iv_loop is nested in the loop to be vectorized. Generate:
10444        vec_step = [S, S, S, S]  */
10445     new_name = step_expr;
10446   else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10447     {
10448       /* When we're using loop_len produced by SELEC_VL, the non-final
10449          iterations are not always processing VF elements.  So vectorize
10450          induction variable instead of
10451
10452            _21 = vect_vec_iv_.6_22 + { VF, ... };
10453
10454          We should generate:
10455
10456            _35 = .SELECT_VL (ivtmp_33, VF);
10457            vect_cst__22 = [vec_duplicate_expr] _35;
10458            _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
10459       gcc_assert (!slp_node);
10460       gimple_seq seq = NULL;
10461       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10462       tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10463       expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10464                                                  unshare_expr (len)),
10465                                    &seq, true, NULL_TREE);
10466       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10467                                step_expr);
10468       gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10469       step_iv_si = &si;
10470     }
10471   else
10472     {
10473       /* iv_loop is the loop to be vectorized. Generate:
10474           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10475       gimple_seq seq = NULL;
10476       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10477         {
10478           expr = build_int_cst (integer_type_node, vf);
10479           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10480         }
10481       else
10482         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10483       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10484                                expr, step_expr);
10485       if (seq)
10486         {
10487           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10488           gcc_assert (!new_bb);
10489         }
10490     }
10491
10492   t = unshare_expr (new_name);
10493   gcc_assert (CONSTANT_CLASS_P (new_name)
10494               || TREE_CODE (new_name) == SSA_NAME);
10495   new_vec = build_vector_from_val (step_vectype, t);
10496   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10497                                new_vec, step_vectype, step_iv_si);
10498
10499
10500   /* Create the following def-use cycle:
10501      loop prolog:
10502          vec_init = ...
10503          vec_step = ...
10504      loop:
10505          vec_iv = PHI <vec_init, vec_loop>
10506          ...
10507          STMT
10508          ...
10509          vec_loop = vec_iv + vec_step;  */
10510
10511   /* Create the induction-phi that defines the induction-operand.  */
10512   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10513   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10514   induc_def = PHI_RESULT (induction_phi);
10515
10516   /* Create the iv update inside the loop  */
10517   stmts = NULL;
10518   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10519   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10520   vec_def = gimple_convert (&stmts, vectype, vec_def);
10521   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10522   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10523
10524   /* Set the arguments of the phi node:  */
10525   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10526   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10527                UNKNOWN_LOCATION);
10528
10529   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10530   *vec_stmt = induction_phi;
10531
10532   /* In case that vectorization factor (VF) is bigger than the number
10533      of elements that we can fit in a vectype (nunits), we have to generate
10534      more than one vector stmt - i.e - we need to "unroll" the
10535      vector stmt by a factor VF/nunits.  For more details see documentation
10536      in vectorizable_operation.  */
10537
10538   if (ncopies > 1)
10539     {
10540       gimple_seq seq = NULL;
10541       /* FORNOW. This restriction should be relaxed.  */
10542       gcc_assert (!nested_in_vect_loop);
10543       /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1.  */
10544       gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10545
10546       /* Create the vector that holds the step of the induction.  */
10547       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10548         {
10549           expr = build_int_cst (integer_type_node, nunits);
10550           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10551         }
10552       else
10553         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10554       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10555                                expr, step_expr);
10556       if (seq)
10557         {
10558           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10559           gcc_assert (!new_bb);
10560         }
10561
10562       t = unshare_expr (new_name);
10563       gcc_assert (CONSTANT_CLASS_P (new_name)
10564                   || TREE_CODE (new_name) == SSA_NAME);
10565       new_vec = build_vector_from_val (step_vectype, t);
10566       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10567                                    new_vec, step_vectype, NULL);
10568
10569       vec_def = induc_def;
10570       for (i = 1; i < ncopies + 1; i++)
10571         {
10572           /* vec_i = vec_prev + vec_step  */
10573           gimple_seq stmts = NULL;
10574           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10575           vec_def = gimple_build (&stmts,
10576                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10577           vec_def = gimple_convert (&stmts, vectype, vec_def);
10578
10579           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10580           if (i < ncopies)
10581             {
10582               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10583               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10584             }
10585           else
10586             {
10587               /* vec_1 = vec_iv + (VF/n * S)
10588                  vec_2 = vec_1 + (VF/n * S)
10589                  ...
10590                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10591
10592                  vec_n is used as vec_loop to save the large step register and
10593                  related operations.  */
10594               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10595                            UNKNOWN_LOCATION);
10596             }
10597         }
10598     }
10599
10600   if (dump_enabled_p ())
10601     dump_printf_loc (MSG_NOTE, vect_location,
10602                      "transform induction: created def-use cycle: %G%G",
10603                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10604
10605   return true;
10606 }
10607
10608 /* Function vectorizable_live_operation_1.
10609
10610    helper function for vectorizable_live_operation.  */
10611
10612 static tree
10613 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10614                                stmt_vec_info stmt_info, basic_block exit_bb,
10615                                tree vectype, int ncopies, slp_tree slp_node,
10616                                tree bitsize, tree bitstart, tree vec_lhs,
10617                                tree lhs_type, gimple_stmt_iterator *exit_gsi)
10618 {
10619   gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10620
10621   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10622   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10623   for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10624     SET_PHI_ARG_DEF (phi, i, vec_lhs);
10625
10626   gimple_seq stmts = NULL;
10627   tree new_tree;
10628
10629   /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
10630   if (integer_zerop (bitstart))
10631     {
10632       tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10633                                       vec_lhs_phi, bitsize, bitstart);
10634
10635       /* Convert the extracted vector element to the scalar type.  */
10636       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10637     }
10638   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10639     {
10640       /* Emit:
10641
10642          SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10643
10644          where VEC_LHS is the vectorized live-out result and MASK is
10645          the loop mask for the final iteration.  */
10646       gcc_assert (ncopies == 1 && !slp_node);
10647       gimple_seq tem = NULL;
10648       gimple_stmt_iterator gsi = gsi_last (tem);
10649       tree len = vect_get_loop_len (loop_vinfo, &gsi,
10650                                     &LOOP_VINFO_LENS (loop_vinfo),
10651                                     1, vectype, 0, 0);
10652
10653       /* BIAS - 1.  */
10654       signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10655       tree bias_minus_one
10656         = int_const_binop (MINUS_EXPR,
10657                            build_int_cst (TREE_TYPE (len), biasval),
10658                            build_one_cst (TREE_TYPE (len)));
10659
10660       /* LAST_INDEX = LEN + (BIAS - 1).  */
10661       tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10662                                      len, bias_minus_one);
10663
10664       /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
10665       tree scalar_res
10666         = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10667                         vec_lhs_phi, last_index);
10668
10669       /* Convert the extracted vector element to the scalar type.  */
10670       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10671     }
10672   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10673     {
10674       /* Emit:
10675
10676          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10677
10678          where VEC_LHS is the vectorized live-out result and MASK is
10679          the loop mask for the final iteration.  */
10680       gcc_assert (!slp_node);
10681       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10682       gimple_seq tem = NULL;
10683       gimple_stmt_iterator gsi = gsi_last (tem);
10684       tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10685                                       &LOOP_VINFO_MASKS (loop_vinfo),
10686                                       1, vectype, 0);
10687       tree scalar_res;
10688       gimple_seq_add_seq (&stmts, tem);
10689
10690       scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10691                                  mask, vec_lhs_phi);
10692
10693       /* Convert the extracted vector element to the scalar type.  */
10694       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10695     }
10696   else
10697     {
10698       tree bftype = TREE_TYPE (vectype);
10699       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10700         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10701       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10702       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10703                                        &stmts, true, NULL_TREE);
10704     }
10705
10706   *exit_gsi = gsi_after_labels (exit_bb);
10707   if (stmts)
10708     gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10709
10710   return new_tree;
10711 }
10712
10713 /* Find the edge that's the final one in the path from SRC to DEST and
10714    return it.  This edge must exist in at most one forwarder edge between.  */
10715
10716 static edge
10717 find_connected_edge (edge src, basic_block dest)
10718 {
10719    if (src->dest == dest)
10720      return src;
10721
10722   return find_edge (src->dest, dest);
10723 }
10724
10725 /* Function vectorizable_live_operation.
10726
10727    STMT_INFO computes a value that is used outside the loop.  Check if
10728    it can be supported.  */
10729
10730 bool
10731 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10732                              slp_tree slp_node, slp_instance slp_node_instance,
10733                              int slp_index, bool vec_stmt_p,
10734                              stmt_vector_for_cost *cost_vec)
10735 {
10736   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10737   imm_use_iterator imm_iter;
10738   tree lhs, lhs_type, bitsize;
10739   tree vectype = (slp_node
10740                   ? SLP_TREE_VECTYPE (slp_node)
10741                   : STMT_VINFO_VECTYPE (stmt_info));
10742   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10743   int ncopies;
10744   gimple *use_stmt;
10745   use_operand_p use_p;
10746   auto_vec<tree> vec_oprnds;
10747   int vec_entry = 0;
10748   poly_uint64 vec_index = 0;
10749
10750   gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10751               || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10752
10753   /* If a stmt of a reduction is live, vectorize it via
10754      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10755      validity so just trigger the transform here.  */
10756   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10757     {
10758       if (!vec_stmt_p)
10759         return true;
10760       if (slp_node)
10761         {
10762           /* For reduction chains the meta-info is attached to
10763              the group leader.  */
10764           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10765             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10766           /* For SLP reductions we vectorize the epilogue for
10767              all involved stmts together.  */
10768           else if (slp_index != 0)
10769             return true;
10770         }
10771       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10772       gcc_assert (reduc_info->is_reduc_info);
10773       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10774           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10775         return true;
10776
10777       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10778                                         slp_node_instance,
10779                                         LOOP_VINFO_IV_EXIT (loop_vinfo));
10780
10781       /* If early break we only have to materialize the reduction on the merge
10782          block, but we have to find an alternate exit first.  */
10783       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10784         {
10785           for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10786             if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10787               {
10788                 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10789                                                   slp_node, slp_node_instance,
10790                                                   exit);
10791                 break;
10792               }
10793         }
10794
10795       return true;
10796     }
10797
10798   /* If STMT is not relevant and it is a simple assignment and its inputs are
10799      invariant then it can remain in place, unvectorized.  The original last
10800      scalar value that it computes will be used.  */
10801   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10802     {
10803       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10804       if (dump_enabled_p ())
10805         dump_printf_loc (MSG_NOTE, vect_location,
10806                          "statement is simple and uses invariant.  Leaving in "
10807                          "place.\n");
10808       return true;
10809     }
10810
10811   if (slp_node)
10812     ncopies = 1;
10813   else
10814     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10815
10816   if (slp_node)
10817     {
10818       gcc_assert (slp_index >= 0);
10819
10820       /* Get the last occurrence of the scalar index from the concatenation of
10821          all the slp vectors. Calculate which slp vector it is and the index
10822          within.  */
10823       int num_scalar = SLP_TREE_LANES (slp_node);
10824       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10825       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10826
10827       /* Calculate which vector contains the result, and which lane of
10828          that vector we need.  */
10829       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10830         {
10831           if (dump_enabled_p ())
10832             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10833                              "Cannot determine which vector holds the"
10834                              " final result.\n");
10835           return false;
10836         }
10837     }
10838
10839   if (!vec_stmt_p)
10840     {
10841       /* No transformation required.  */
10842       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10843         {
10844           if (slp_node)
10845             {
10846               if (dump_enabled_p ())
10847                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10848                                  "can't operate on partial vectors "
10849                                  "because an SLP statement is live after "
10850                                  "the loop.\n");
10851               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10852             }
10853           else if (ncopies > 1)
10854             {
10855               if (dump_enabled_p ())
10856                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10857                                  "can't operate on partial vectors "
10858                                  "because ncopies is greater than 1.\n");
10859               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10860             }
10861           else
10862             {
10863               gcc_assert (ncopies == 1 && !slp_node);
10864               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10865                                                   OPTIMIZE_FOR_SPEED))
10866                 vect_record_loop_mask (loop_vinfo,
10867                                        &LOOP_VINFO_MASKS (loop_vinfo),
10868                                        1, vectype, NULL);
10869               else if (can_vec_extract_var_idx_p (
10870                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10871                 vect_record_loop_len (loop_vinfo,
10872                                       &LOOP_VINFO_LENS (loop_vinfo),
10873                                       1, vectype, 1);
10874               else
10875                 {
10876                   if (dump_enabled_p ())
10877                     dump_printf_loc (
10878                       MSG_MISSED_OPTIMIZATION, vect_location,
10879                       "can't operate on partial vectors "
10880                       "because the target doesn't support extract "
10881                       "last reduction.\n");
10882                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10883                 }
10884             }
10885         }
10886       /* ???  Enable for loop costing as well.  */
10887       if (!loop_vinfo)
10888         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10889                           0, vect_epilogue);
10890       return true;
10891     }
10892
10893   /* Use the lhs of the original scalar statement.  */
10894   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10895   if (dump_enabled_p ())
10896     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10897                      "stmt %G", stmt);
10898
10899   lhs = gimple_get_lhs (stmt);
10900   lhs_type = TREE_TYPE (lhs);
10901
10902   bitsize = vector_element_bits_tree (vectype);
10903
10904   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10905   tree vec_lhs, vec_lhs0, bitstart;
10906   gimple *vec_stmt, *vec_stmt0;
10907   if (slp_node)
10908     {
10909       gcc_assert (!loop_vinfo
10910                   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10911                       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10912
10913       /* Get the correct slp vectorized stmt.  */
10914       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10915       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10916
10917       /* In case we need to early break vectorize also get the first stmt.  */
10918       vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10919       vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10920
10921       /* Get entry to use.  */
10922       bitstart = bitsize_int (vec_index);
10923       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10924     }
10925   else
10926     {
10927       /* For multiple copies, get the last copy.  */
10928       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10929       vec_lhs = gimple_get_lhs (vec_stmt);
10930
10931       /* In case we need to early break vectorize also get the first stmt.  */
10932       vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10933       vec_lhs0 = gimple_get_lhs (vec_stmt0);
10934
10935       /* Get the last lane in the vector.  */
10936       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10937     }
10938
10939   if (loop_vinfo)
10940     {
10941       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10942          requirement, insert one phi node for it.  It looks like:
10943            loop;
10944          BB:
10945            # lhs' = PHI <lhs>
10946          ==>
10947            loop;
10948          BB:
10949            # vec_lhs' = PHI <vec_lhs>
10950            new_tree = lane_extract <vec_lhs', ...>;
10951            lhs' = new_tree;  */
10952
10953       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10954       /* Check if we have a loop where the chosen exit is not the main exit,
10955          in these cases for an early break we restart the iteration the vector code
10956          did.  For the live values we want the value at the start of the iteration
10957          rather than at the end.  */
10958       edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10959       bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10960       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10961         if (!is_gimple_debug (use_stmt)
10962             && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10963           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10964             {
10965               edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10966                                            phi_arg_index_from_use (use_p));
10967               bool main_exit_edge = e == main_e
10968                                     || find_connected_edge (main_e, e->src);
10969
10970               /* Early exits have an merge block, we want the merge block itself
10971                  so use ->src.  For main exit the merge block is the
10972                  destination.  */
10973               basic_block dest = main_exit_edge ? main_e->dest : e->src;
10974               tree tmp_vec_lhs = vec_lhs;
10975               tree tmp_bitstart = bitstart;
10976
10977               /* For early exit where the exit is not in the BB that leads
10978                  to the latch then we're restarting the iteration in the
10979                  scalar loop.  So get the first live value.  */
10980               restart_loop = restart_loop || !main_exit_edge;
10981               if (restart_loop
10982                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10983                 {
10984                   tmp_vec_lhs = vec_lhs0;
10985                   tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10986                 }
10987
10988               gimple_stmt_iterator exit_gsi;
10989               tree new_tree
10990                 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10991                                                  dest, vectype, ncopies,
10992                                                  slp_node, bitsize,
10993                                                  tmp_bitstart, tmp_vec_lhs,
10994                                                  lhs_type, &exit_gsi);
10995
10996               if (gimple_phi_num_args (use_stmt) == 1)
10997                 {
10998                   auto gsi = gsi_for_stmt (use_stmt);
10999                   remove_phi_node (&gsi, false);
11000                   tree lhs_phi = gimple_phi_result (use_stmt);
11001                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
11002                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
11003                 }
11004               else
11005                 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
11006           }
11007
11008       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
11009       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11010         gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11011     }
11012   else
11013     {
11014       /* For basic-block vectorization simply insert the lane-extraction.  */
11015       tree bftype = TREE_TYPE (vectype);
11016       if (VECTOR_BOOLEAN_TYPE_P (vectype))
11017         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11018       tree new_tree = build3 (BIT_FIELD_REF, bftype,
11019                               vec_lhs, bitsize, bitstart);
11020       gimple_seq stmts = NULL;
11021       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11022                                        &stmts, true, NULL_TREE);
11023       if (TREE_CODE (new_tree) == SSA_NAME
11024           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11025         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11026       if (is_a <gphi *> (vec_stmt))
11027         {
11028           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11029           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11030         }
11031       else
11032         {
11033           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11034           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11035         }
11036
11037       /* Replace use of lhs with newly computed result.  If the use stmt is a
11038          single arg PHI, just replace all uses of PHI result.  It's necessary
11039          because lcssa PHI defining lhs may be before newly inserted stmt.  */
11040       use_operand_p use_p;
11041       stmt_vec_info use_stmt_info;
11042       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11043         if (!is_gimple_debug (use_stmt)
11044             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11045                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11046           {
11047             /* ???  This can happen when the live lane ends up being
11048                rooted in a vector construction code-generated by an
11049                external SLP node (and code-generation for that already
11050                happened).  See gcc.dg/vect/bb-slp-47.c.
11051                Doing this is what would happen if that vector CTOR
11052                were not code-generated yet so it is not too bad.
11053                ???  In fact we'd likely want to avoid this situation
11054                in the first place.  */
11055             if (TREE_CODE (new_tree) == SSA_NAME
11056                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11057                 && gimple_code (use_stmt) != GIMPLE_PHI
11058                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11059                                                 use_stmt))
11060               {
11061                 if (dump_enabled_p ())
11062                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11063                                    "Using original scalar computation for "
11064                                    "live lane because use preceeds vector "
11065                                    "def\n");
11066                 continue;
11067               }
11068             /* ???  It can also happen that we end up pulling a def into
11069                a loop where replacing out-of-loop uses would require
11070                a new LC SSA PHI node.  Retain the original scalar in
11071                those cases as well.  PR98064.  */
11072             if (TREE_CODE (new_tree) == SSA_NAME
11073                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11074                 && (gimple_bb (use_stmt)->loop_father
11075                     != gimple_bb (vec_stmt)->loop_father)
11076                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11077                                         gimple_bb (use_stmt)->loop_father))
11078               {
11079                 if (dump_enabled_p ())
11080                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11081                                    "Using original scalar computation for "
11082                                    "live lane because there is an out-of-loop "
11083                                    "definition for it\n");
11084                 continue;
11085               }
11086             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11087               SET_USE (use_p, new_tree);
11088             update_stmt (use_stmt);
11089           }
11090     }
11091
11092   return true;
11093 }
11094
11095 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
11096
11097 static void
11098 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11099 {
11100   ssa_op_iter op_iter;
11101   imm_use_iterator imm_iter;
11102   def_operand_p def_p;
11103   gimple *ustmt;
11104
11105   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11106     {
11107       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11108         {
11109           basic_block bb;
11110
11111           if (!is_gimple_debug (ustmt))
11112             continue;
11113
11114           bb = gimple_bb (ustmt);
11115
11116           if (!flow_bb_inside_loop_p (loop, bb))
11117             {
11118               if (gimple_debug_bind_p (ustmt))
11119                 {
11120                   if (dump_enabled_p ())
11121                     dump_printf_loc (MSG_NOTE, vect_location,
11122                                      "killing debug use\n");
11123
11124                   gimple_debug_bind_reset_value (ustmt);
11125                   update_stmt (ustmt);
11126                 }
11127               else
11128                 gcc_unreachable ();
11129             }
11130         }
11131     }
11132 }
11133
11134 /* Given loop represented by LOOP_VINFO, return true if computation of
11135    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11136    otherwise.  */
11137
11138 static bool
11139 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11140 {
11141   /* Constant case.  */
11142   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11143     {
11144       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11145       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11146
11147       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11148       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11149       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11150         return true;
11151     }
11152
11153   widest_int max;
11154   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11155   /* Check the upper bound of loop niters.  */
11156   if (get_max_loop_iterations (loop, &max))
11157     {
11158       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11159       signop sgn = TYPE_SIGN (type);
11160       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11161       if (max < type_max)
11162         return true;
11163     }
11164   return false;
11165 }
11166
11167 /* Return a mask type with half the number of elements as OLD_TYPE,
11168    given that it should have mode NEW_MODE.  */
11169
11170 tree
11171 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11172 {
11173   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11174   return build_truth_vector_type_for_mode (nunits, new_mode);
11175 }
11176
11177 /* Return a mask type with twice as many elements as OLD_TYPE,
11178    given that it should have mode NEW_MODE.  */
11179
11180 tree
11181 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11182 {
11183   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11184   return build_truth_vector_type_for_mode (nunits, new_mode);
11185 }
11186
11187 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11188    contain a sequence of NVECTORS masks that each control a vector of type
11189    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
11190    these vector masks with the vector version of SCALAR_MASK.  */
11191
11192 void
11193 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11194                        unsigned int nvectors, tree vectype, tree scalar_mask)
11195 {
11196   gcc_assert (nvectors != 0);
11197
11198   if (scalar_mask)
11199     {
11200       scalar_cond_masked_key cond (scalar_mask, nvectors);
11201       loop_vinfo->scalar_cond_masked_set.add (cond);
11202     }
11203
11204   masks->mask_set.add (std::make_pair (vectype, nvectors));
11205 }
11206
11207 /* Given a complete set of masks MASKS, extract mask number INDEX
11208    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11209    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
11210
11211    See the comment above vec_loop_masks for more details about the mask
11212    arrangement.  */
11213
11214 tree
11215 vect_get_loop_mask (loop_vec_info loop_vinfo,
11216                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11217                     unsigned int nvectors, tree vectype, unsigned int index)
11218 {
11219   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11220       == vect_partial_vectors_while_ult)
11221     {
11222       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11223       tree mask_type = rgm->type;
11224
11225       /* Populate the rgroup's mask array, if this is the first time we've
11226          used it.  */
11227       if (rgm->controls.is_empty ())
11228         {
11229           rgm->controls.safe_grow_cleared (nvectors, true);
11230           for (unsigned int i = 0; i < nvectors; ++i)
11231             {
11232               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11233               /* Provide a dummy definition until the real one is available.  */
11234               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11235               rgm->controls[i] = mask;
11236             }
11237         }
11238
11239       tree mask = rgm->controls[index];
11240       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11241                     TYPE_VECTOR_SUBPARTS (vectype)))
11242         {
11243           /* A loop mask for data type X can be reused for data type Y
11244              if X has N times more elements than Y and if Y's elements
11245              are N times bigger than X's.  In this case each sequence
11246              of N elements in the loop mask will be all-zero or all-one.
11247              We can then view-convert the mask so that each sequence of
11248              N elements is replaced by a single element.  */
11249           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11250                                   TYPE_VECTOR_SUBPARTS (vectype)));
11251           gimple_seq seq = NULL;
11252           mask_type = truth_type_for (vectype);
11253           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11254           if (seq)
11255             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11256         }
11257       return mask;
11258     }
11259   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11260            == vect_partial_vectors_avx512)
11261     {
11262       /* The number of scalars per iteration and the number of vectors are
11263          both compile-time constants.  */
11264       unsigned int nscalars_per_iter
11265         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11266                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11267
11268       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11269
11270       /* The stored nV is dependent on the mask type produced.  */
11271       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11272                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11273                   == rgm->factor);
11274       nvectors = rgm->factor;
11275
11276       /* Populate the rgroup's mask array, if this is the first time we've
11277          used it.  */
11278       if (rgm->controls.is_empty ())
11279         {
11280           rgm->controls.safe_grow_cleared (nvectors, true);
11281           for (unsigned int i = 0; i < nvectors; ++i)
11282             {
11283               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11284               /* Provide a dummy definition until the real one is available.  */
11285               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11286               rgm->controls[i] = mask;
11287             }
11288         }
11289       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11290                     TYPE_VECTOR_SUBPARTS (vectype)))
11291         return rgm->controls[index];
11292
11293       /* Split the vector if needed.  Since we are dealing with integer mode
11294          masks with AVX512 we can operate on the integer representation
11295          performing the whole vector shifting.  */
11296       unsigned HOST_WIDE_INT factor;
11297       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11298                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
11299       gcc_assert (ok);
11300       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11301       tree mask_type = truth_type_for (vectype);
11302       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11303       unsigned vi = index / factor;
11304       unsigned vpart = index % factor;
11305       tree vec = rgm->controls[vi];
11306       gimple_seq seq = NULL;
11307       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11308                           lang_hooks.types.type_for_mode
11309                                 (TYPE_MODE (rgm->type), 1), vec);
11310       /* For integer mode masks simply shift the right bits into position.  */
11311       if (vpart != 0)
11312         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11313                             build_int_cst (integer_type_node,
11314                                            (TYPE_VECTOR_SUBPARTS (vectype)
11315                                             * vpart)));
11316       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11317                                     (TYPE_MODE (mask_type), 1), vec);
11318       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11319       if (seq)
11320         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11321       return vec;
11322     }
11323   else
11324     gcc_unreachable ();
11325 }
11326
11327 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11328    lengths for controlling an operation on VECTYPE.  The operation splits
11329    each element of VECTYPE into FACTOR separate subelements, measuring the
11330    length as a number of these subelements.  */
11331
11332 void
11333 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11334                       unsigned int nvectors, tree vectype, unsigned int factor)
11335 {
11336   gcc_assert (nvectors != 0);
11337   if (lens->length () < nvectors)
11338     lens->safe_grow_cleared (nvectors, true);
11339   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11340
11341   /* The number of scalars per iteration, scalar occupied bytes and
11342      the number of vectors are both compile-time constants.  */
11343   unsigned int nscalars_per_iter
11344     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11345                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11346
11347   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11348     {
11349       /* For now, we only support cases in which all loads and stores fall back
11350          to VnQI or none do.  */
11351       gcc_assert (!rgl->max_nscalars_per_iter
11352                   || (rgl->factor == 1 && factor == 1)
11353                   || (rgl->max_nscalars_per_iter * rgl->factor
11354                       == nscalars_per_iter * factor));
11355       rgl->max_nscalars_per_iter = nscalars_per_iter;
11356       rgl->type = vectype;
11357       rgl->factor = factor;
11358     }
11359 }
11360
11361 /* Given a complete set of lengths LENS, extract length number INDEX
11362    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11363    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
11364    multipled by the number of elements that should be processed.
11365    Insert any set-up statements before GSI.  */
11366
11367 tree
11368 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11369                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11370                    unsigned int index, unsigned int factor)
11371 {
11372   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11373   bool use_bias_adjusted_len =
11374     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11375
11376   /* Populate the rgroup's len array, if this is the first time we've
11377      used it.  */
11378   if (rgl->controls.is_empty ())
11379     {
11380       rgl->controls.safe_grow_cleared (nvectors, true);
11381       for (unsigned int i = 0; i < nvectors; ++i)
11382         {
11383           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11384           gcc_assert (len_type != NULL_TREE);
11385
11386           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11387
11388           /* Provide a dummy definition until the real one is available.  */
11389           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11390           rgl->controls[i] = len;
11391
11392           if (use_bias_adjusted_len)
11393             {
11394               gcc_assert (i == 0);
11395               tree adjusted_len =
11396                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11397               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11398               rgl->bias_adjusted_ctrl = adjusted_len;
11399             }
11400         }
11401     }
11402
11403   if (use_bias_adjusted_len)
11404     return rgl->bias_adjusted_ctrl;
11405
11406   tree loop_len = rgl->controls[index];
11407   if (rgl->factor == 1 && factor == 1)
11408     {
11409       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11410       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11411       if (maybe_ne (nunits1, nunits2))
11412         {
11413           /* A loop len for data type X can be reused for data type Y
11414              if X has N times more elements than Y and if Y's elements
11415              are N times bigger than X's.  */
11416           gcc_assert (multiple_p (nunits1, nunits2));
11417           factor = exact_div (nunits1, nunits2).to_constant ();
11418           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11419           gimple_seq seq = NULL;
11420           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11421                                    build_int_cst (iv_type, factor));
11422           if (seq)
11423             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11424         }
11425     }
11426   return loop_len;
11427 }
11428
11429 /* Scale profiling counters by estimation for LOOP which is vectorized
11430    by factor VF.
11431    If FLAT is true, the loop we started with had unrealistically flat
11432    profile.  */
11433
11434 static void
11435 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11436 {
11437   /* For flat profiles do not scale down proportionally by VF and only
11438      cap by known iteration count bounds.  */
11439   if (flat)
11440     {
11441       if (dump_file && (dump_flags & TDF_DETAILS))
11442         fprintf (dump_file,
11443                  "Vectorized loop profile seems flat; not scaling iteration "
11444                  "count down by the vectorization factor %i\n", vf);
11445       scale_loop_profile (loop, profile_probability::always (),
11446                           get_likely_max_loop_iterations_int (loop));
11447       return;
11448     }
11449   /* Loop body executes VF fewer times and exit increases VF times.  */
11450   profile_count entry_count = loop_preheader_edge (loop)->count ();
11451
11452   /* If we have unreliable loop profile avoid dropping entry
11453      count bellow header count.  This can happen since loops
11454      has unrealistically low trip counts.  */
11455   while (vf > 1
11456          && loop->header->count > entry_count
11457          && loop->header->count < entry_count * vf)
11458     {
11459       if (dump_file && (dump_flags & TDF_DETAILS))
11460         fprintf (dump_file,
11461                  "Vectorization factor %i seems too large for profile "
11462                  "prevoiusly believed to be consistent; reducing.\n", vf);
11463       vf /= 2;
11464     }
11465
11466   if (entry_count.nonzero_p ())
11467     set_edge_probability_and_rescale_others
11468             (exit_e,
11469              entry_count.probability_in (loop->header->count / vf));
11470   /* Avoid producing very large exit probability when we do not have
11471      sensible profile.  */
11472   else if (exit_e->probability < profile_probability::always () / (vf * 2))
11473     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11474   loop->latch->count = single_pred_edge (loop->latch)->count ();
11475
11476   scale_loop_profile (loop, profile_probability::always () / vf,
11477                       get_likely_max_loop_iterations_int (loop));
11478 }
11479
11480 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11481    latch edge values originally defined by it.  */
11482
11483 static void
11484 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11485                                      stmt_vec_info def_stmt_info)
11486 {
11487   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11488   if (!def || TREE_CODE (def) != SSA_NAME)
11489     return;
11490   stmt_vec_info phi_info;
11491   imm_use_iterator iter;
11492   use_operand_p use_p;
11493   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11494     {
11495       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11496       if (!phi)
11497         continue;
11498       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11499             && (phi_info = loop_vinfo->lookup_stmt (phi))
11500             && STMT_VINFO_RELEVANT_P (phi_info)))
11501         continue;
11502       loop_p loop = gimple_bb (phi)->loop_father;
11503       edge e = loop_latch_edge (loop);
11504       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11505         continue;
11506
11507       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11508           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11509           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11510         {
11511           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11512           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11513           gcc_assert (phi_defs.length () == latch_defs.length ());
11514           for (unsigned i = 0; i < phi_defs.length (); ++i)
11515             add_phi_arg (as_a <gphi *> (phi_defs[i]),
11516                          gimple_get_lhs (latch_defs[i]), e,
11517                          gimple_phi_arg_location (phi, e->dest_idx));
11518         }
11519       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11520         {
11521           /* For first order recurrences we have to update both uses of
11522              the latch definition, the one in the PHI node and the one
11523              in the generated VEC_PERM_EXPR.  */
11524           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11525           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11526           gcc_assert (phi_defs.length () == latch_defs.length ());
11527           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11528           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11529           for (unsigned i = 0; i < phi_defs.length (); ++i)
11530             {
11531               gassign *perm = as_a <gassign *> (phi_defs[i]);
11532               if (i > 0)
11533                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11534               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11535               update_stmt (perm);
11536             }
11537           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11538                        gimple_phi_arg_location (phi, e->dest_idx));
11539         }
11540     }
11541 }
11542
11543 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11544    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11545    stmt_vec_info.  */
11546
11547 static bool
11548 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11549                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11550 {
11551   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11552   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11553
11554   if (dump_enabled_p ())
11555     dump_printf_loc (MSG_NOTE, vect_location,
11556                      "------>vectorizing statement: %G", stmt_info->stmt);
11557
11558   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11559     vect_loop_kill_debug_uses (loop, stmt_info);
11560
11561   if (!STMT_VINFO_RELEVANT_P (stmt_info)
11562       && !STMT_VINFO_LIVE_P (stmt_info))
11563     {
11564       if (is_gimple_call (stmt_info->stmt)
11565           && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11566         {
11567           gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11568           *seen_store = stmt_info;
11569           return false;
11570         }
11571       return false;
11572     }
11573
11574   if (STMT_VINFO_VECTYPE (stmt_info))
11575     {
11576       poly_uint64 nunits
11577         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11578       if (!STMT_SLP_TYPE (stmt_info)
11579           && maybe_ne (nunits, vf)
11580           && dump_enabled_p ())
11581         /* For SLP VF is set according to unrolling factor, and not
11582            to vector size, hence for SLP this print is not valid.  */
11583         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11584     }
11585
11586   /* Pure SLP statements have already been vectorized.  We still need
11587      to apply loop vectorization to hybrid SLP statements.  */
11588   if (PURE_SLP_STMT (stmt_info))
11589     return false;
11590
11591   if (dump_enabled_p ())
11592     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11593
11594   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11595     *seen_store = stmt_info;
11596
11597   return true;
11598 }
11599
11600 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11601    in the hash_map with its corresponding values.  */
11602
11603 static tree
11604 find_in_mapping (tree t, void *context)
11605 {
11606   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11607
11608   tree *value = mapping->get (t);
11609   return value ? *value : t;
11610 }
11611
11612 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11613    original loop that has now been vectorized.
11614
11615    The inits of the data_references need to be advanced with the number of
11616    iterations of the main loop.  This has been computed in vect_do_peeling and
11617    is stored in parameter ADVANCE.  We first restore the data_references
11618    initial offset with the values recored in ORIG_DRS_INIT.
11619
11620    Since the loop_vec_info of this EPILOGUE was constructed for the original
11621    loop, its stmt_vec_infos all point to the original statements.  These need
11622    to be updated to point to their corresponding copies as well as the SSA_NAMES
11623    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11624
11625    The data_reference's connections also need to be updated.  Their
11626    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11627    stmt_vec_infos, their statements need to point to their corresponding copy,
11628    if they are gather loads or scatter stores then their reference needs to be
11629    updated to point to its corresponding copy and finally we set
11630    'base_misaligned' to false as we have already peeled for alignment in the
11631    prologue of the main loop.  */
11632
11633 static void
11634 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11635 {
11636   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11637   auto_vec<gimple *> stmt_worklist;
11638   hash_map<tree,tree> mapping;
11639   gimple *orig_stmt, *new_stmt;
11640   gimple_stmt_iterator epilogue_gsi;
11641   gphi_iterator epilogue_phi_gsi;
11642   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11643   basic_block *epilogue_bbs = get_loop_body (epilogue);
11644   unsigned i;
11645
11646   free (LOOP_VINFO_BBS (epilogue_vinfo));
11647   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11648
11649   /* Advance data_reference's with the number of iterations of the previous
11650      loop and its prologue.  */
11651   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11652
11653
11654   /* The EPILOGUE loop is a copy of the original loop so they share the same
11655      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11656      point to the copied statements.  We also create a mapping of all LHS' in
11657      the original loop and all the LHS' in the EPILOGUE and create worklists to
11658      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11659   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11660     {
11661       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11662            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11663         {
11664           new_stmt = epilogue_phi_gsi.phi ();
11665
11666           gcc_assert (gimple_uid (new_stmt) > 0);
11667           stmt_vinfo
11668             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11669
11670           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11671           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11672
11673           mapping.put (gimple_phi_result (orig_stmt),
11674                        gimple_phi_result (new_stmt));
11675           /* PHI nodes can not have patterns or related statements.  */
11676           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11677                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11678         }
11679
11680       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11681            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11682         {
11683           new_stmt = gsi_stmt (epilogue_gsi);
11684           if (is_gimple_debug (new_stmt))
11685             continue;
11686
11687           gcc_assert (gimple_uid (new_stmt) > 0);
11688           stmt_vinfo
11689             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11690
11691           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11692           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11693
11694           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11695             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11696
11697           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11698             {
11699               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11700               for (gimple_stmt_iterator gsi = gsi_start (seq);
11701                    !gsi_end_p (gsi); gsi_next (&gsi))
11702                 stmt_worklist.safe_push (gsi_stmt (gsi));
11703             }
11704
11705           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11706           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11707             {
11708               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11709               stmt_worklist.safe_push (stmt);
11710               /* Set BB such that the assert in
11711                 'get_initial_def_for_reduction' is able to determine that
11712                 the BB of the related stmt is inside this loop.  */
11713               gimple_set_bb (stmt,
11714                              gimple_bb (new_stmt));
11715               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11716               gcc_assert (related_vinfo == NULL
11717                           || related_vinfo == stmt_vinfo);
11718             }
11719         }
11720     }
11721
11722   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11723      using the original main loop and thus need to be updated to refer to the
11724      cloned variables used in the epilogue.  */
11725   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11726     {
11727       gimple *stmt = stmt_worklist[i];
11728       tree *new_op;
11729
11730       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11731         {
11732           tree op = gimple_op (stmt, j);
11733           if ((new_op = mapping.get(op)))
11734             gimple_set_op (stmt, j, *new_op);
11735           else
11736             {
11737               /* PR92429: The last argument of simplify_replace_tree disables
11738                  folding when replacing arguments.  This is required as
11739                  otherwise you might end up with different statements than the
11740                  ones analyzed in vect_loop_analyze, leading to different
11741                  vectorization.  */
11742               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11743                                           &find_in_mapping, &mapping, false);
11744               gimple_set_op (stmt, j, op);
11745             }
11746         }
11747     }
11748
11749   struct data_reference *dr;
11750   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11751   FOR_EACH_VEC_ELT (datarefs, i, dr)
11752     {
11753       orig_stmt = DR_STMT (dr);
11754       gcc_assert (gimple_uid (orig_stmt) > 0);
11755       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11756       /* Data references for gather loads and scatter stores do not use the
11757          updated offset we set using ADVANCE.  Instead we have to make sure the
11758          reference in the data references point to the corresponding copy of
11759          the original in the epilogue.  Make sure to update both
11760          gather/scatters recognized by dataref analysis and also other
11761          refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
11762       auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11763       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11764           || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11765         {
11766           DR_REF (dr)
11767             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11768                                      &find_in_mapping, &mapping);
11769           DR_BASE_ADDRESS (dr)
11770             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11771                                      &find_in_mapping, &mapping);
11772         }
11773       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11774       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11775       /* The vector size of the epilogue is smaller than that of the main loop
11776          so the alignment is either the same or lower. This means the dr will
11777          thus by definition be aligned.  */
11778       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11779     }
11780
11781   epilogue_vinfo->shared->datarefs_copy.release ();
11782   epilogue_vinfo->shared->save_datarefs ();
11783 }
11784
11785 /*  When vectorizing early break statements instructions that happen before
11786     the early break in the current BB need to be moved to after the early
11787     break.  This function deals with that and assumes that any validity
11788     checks has already been performed.
11789
11790     While moving the instructions if it encounters a VUSE or VDEF it then
11791     corrects the VUSES as it moves the statements along.  GDEST is the location
11792     in which to insert the new statements.  */
11793
11794 static void
11795 move_early_exit_stmts (loop_vec_info loop_vinfo)
11796 {
11797   DUMP_VECT_SCOPE ("move_early_exit_stmts");
11798
11799   if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11800     return;
11801
11802   /* Move all stmts that need moving.  */
11803   basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11804   gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11805
11806   for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11807     {
11808       /* Check to see if statement is still required for vect or has been
11809          elided.  */
11810       auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11811       if (!stmt_info)
11812         continue;
11813
11814       if (dump_enabled_p ())
11815         dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11816
11817       gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11818       gsi_move_before (&stmt_gsi, &dest_gsi);
11819       gsi_prev (&dest_gsi);
11820     }
11821
11822   /* Update all the stmts with their new reaching VUSES.  */
11823   tree vuse
11824     = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11825   for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11826     {
11827       if (dump_enabled_p ())
11828           dump_printf_loc (MSG_NOTE, vect_location,
11829                            "updating vuse to %T for load %G", vuse, p);
11830       gimple_set_vuse (p, vuse);
11831       update_stmt (p);
11832     }
11833
11834   /* And update the LC PHIs on exits.  */
11835   for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP  (loop_vinfo)))
11836     if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11837       if (gphi *phi = get_virtual_phi (e->dest))
11838         SET_PHI_ARG_DEF_ON_EDGE (phi, e, vuse);
11839 }
11840
11841 /* Function vect_transform_loop.
11842
11843    The analysis phase has determined that the loop is vectorizable.
11844    Vectorize the loop - created vectorized stmts to replace the scalar
11845    stmts in the loop, and update the loop exit condition.
11846    Returns scalar epilogue loop if any.  */
11847
11848 class loop *
11849 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11850 {
11851   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11852   class loop *epilogue = NULL;
11853   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11854   int nbbs = loop->num_nodes;
11855   int i;
11856   tree niters_vector = NULL_TREE;
11857   tree step_vector = NULL_TREE;
11858   tree niters_vector_mult_vf = NULL_TREE;
11859   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11860   unsigned int lowest_vf = constant_lower_bound (vf);
11861   gimple *stmt;
11862   bool check_profitability = false;
11863   unsigned int th;
11864   bool flat = maybe_flat_loop_profile (loop);
11865
11866   DUMP_VECT_SCOPE ("vec_transform_loop");
11867
11868   loop_vinfo->shared->check_datarefs ();
11869
11870   /* Use the more conservative vectorization threshold.  If the number
11871      of iterations is constant assume the cost check has been performed
11872      by our caller.  If the threshold makes all loops profitable that
11873      run at least the (estimated) vectorization factor number of times
11874      checking is pointless, too.  */
11875   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11876   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11877     {
11878       if (dump_enabled_p ())
11879         dump_printf_loc (MSG_NOTE, vect_location,
11880                          "Profitability threshold is %d loop iterations.\n",
11881                          th);
11882       check_profitability = true;
11883     }
11884
11885   /* Make sure there exists a single-predecessor exit bb.  Do this before
11886      versioning.   */
11887   edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11888   if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11889     {
11890       split_loop_exit_edge (e, true);
11891       if (dump_enabled_p ())
11892         dump_printf (MSG_NOTE, "split exit edge\n");
11893     }
11894
11895   /* Version the loop first, if required, so the profitability check
11896      comes first.  */
11897
11898   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11899     {
11900       class loop *sloop
11901         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11902       sloop->force_vectorize = false;
11903       check_profitability = false;
11904     }
11905
11906   /* Make sure there exists a single-predecessor exit bb also on the
11907      scalar loop copy.  Do this after versioning but before peeling
11908      so CFG structure is fine for both scalar and if-converted loop
11909      to make slpeel_duplicate_current_defs_from_edges face matched
11910      loop closed PHI nodes on the exit.  */
11911   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11912     {
11913       e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11914       if (! single_pred_p (e->dest))
11915         {
11916           split_loop_exit_edge (e, true);
11917           if (dump_enabled_p ())
11918             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11919         }
11920     }
11921
11922   tree niters = vect_build_loop_niters (loop_vinfo);
11923   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11924   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11925   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11926   tree advance;
11927   drs_init_vec orig_drs_init;
11928
11929   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11930                               &step_vector, &niters_vector_mult_vf, th,
11931                               check_profitability, niters_no_overflow,
11932                               &advance);
11933   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11934       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11935     {
11936       /* Ifcvt duplicates loop preheader, loop body and produces an basic
11937          block after loop exit.  We need to scale all that.  */
11938       basic_block preheader
11939         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11940       preheader->count
11941         = preheader->count.apply_probability
11942               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11943       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11944                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11945       LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11946     }
11947
11948   if (niters_vector == NULL_TREE)
11949     {
11950       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11951           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11952           && known_eq (lowest_vf, vf))
11953         {
11954           niters_vector
11955             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11956                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11957           step_vector = build_one_cst (TREE_TYPE (niters));
11958         }
11959       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11960         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11961                                      &step_vector, niters_no_overflow);
11962       else
11963         /* vect_do_peeling subtracted the number of peeled prologue
11964            iterations from LOOP_VINFO_NITERS.  */
11965         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11966                                      &niters_vector, &step_vector,
11967                                      niters_no_overflow);
11968     }
11969
11970   /* 1) Make sure the loop header has exactly two entries
11971      2) Make sure we have a preheader basic block.  */
11972
11973   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11974
11975   split_edge (loop_preheader_edge (loop));
11976
11977   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11978     /* This will deal with any possible peeling.  */
11979     vect_prepare_for_masked_peels (loop_vinfo);
11980
11981   /* Handle any code motion that we need to for early-break vectorization after
11982      we've done peeling but just before we start vectorizing.  */
11983   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11984     move_early_exit_stmts (loop_vinfo);
11985
11986   /* Schedule the SLP instances first, then handle loop vectorization
11987      below.  */
11988   if (!loop_vinfo->slp_instances.is_empty ())
11989     {
11990       DUMP_VECT_SCOPE ("scheduling SLP instances");
11991       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11992     }
11993
11994   /* FORNOW: the vectorizer supports only loops which body consist
11995      of one basic block (header + empty latch). When the vectorizer will
11996      support more involved loop forms, the order by which the BBs are
11997      traversed need to be reconsidered.  */
11998
11999   for (i = 0; i < nbbs; i++)
12000     {
12001       basic_block bb = bbs[i];
12002       stmt_vec_info stmt_info;
12003
12004       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12005            gsi_next (&si))
12006         {
12007           gphi *phi = si.phi ();
12008           if (dump_enabled_p ())
12009             dump_printf_loc (MSG_NOTE, vect_location,
12010                              "------>vectorizing phi: %G", (gimple *) phi);
12011           stmt_info = loop_vinfo->lookup_stmt (phi);
12012           if (!stmt_info)
12013             continue;
12014
12015           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12016             vect_loop_kill_debug_uses (loop, stmt_info);
12017
12018           if (!STMT_VINFO_RELEVANT_P (stmt_info)
12019               && !STMT_VINFO_LIVE_P (stmt_info))
12020             continue;
12021
12022           if (STMT_VINFO_VECTYPE (stmt_info)
12023               && (maybe_ne
12024                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12025               && dump_enabled_p ())
12026             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12027
12028           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12029                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12030                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12031                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12032                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12033                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12034               && ! PURE_SLP_STMT (stmt_info))
12035             {
12036               if (dump_enabled_p ())
12037                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12038               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12039             }
12040         }
12041
12042       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12043            gsi_next (&si))
12044         {
12045           gphi *phi = si.phi ();
12046           stmt_info = loop_vinfo->lookup_stmt (phi);
12047           if (!stmt_info)
12048             continue;
12049
12050           if (!STMT_VINFO_RELEVANT_P (stmt_info)
12051               && !STMT_VINFO_LIVE_P (stmt_info))
12052             continue;
12053
12054           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12055                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12056                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12057                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12058                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12059                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12060               && ! PURE_SLP_STMT (stmt_info))
12061             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12062         }
12063
12064       for (gimple_stmt_iterator si = gsi_start_bb (bb);
12065            !gsi_end_p (si);)
12066         {
12067           stmt = gsi_stmt (si);
12068           /* During vectorization remove existing clobber stmts.  */
12069           if (gimple_clobber_p (stmt))
12070             {
12071               unlink_stmt_vdef (stmt);
12072               gsi_remove (&si, true);
12073               release_defs (stmt);
12074             }
12075           else
12076             {
12077               /* Ignore vector stmts created in the outer loop.  */
12078               stmt_info = loop_vinfo->lookup_stmt (stmt);
12079
12080               /* vector stmts created in the outer-loop during vectorization of
12081                  stmts in an inner-loop may not have a stmt_info, and do not
12082                  need to be vectorized.  */
12083               stmt_vec_info seen_store = NULL;
12084               if (stmt_info)
12085                 {
12086                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12087                     {
12088                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12089                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12090                            !gsi_end_p (subsi); gsi_next (&subsi))
12091                         {
12092                           stmt_vec_info pat_stmt_info
12093                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12094                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12095                                                     &si, &seen_store);
12096                         }
12097                       stmt_vec_info pat_stmt_info
12098                         = STMT_VINFO_RELATED_STMT (stmt_info);
12099                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12100                                                     &si, &seen_store))
12101                         maybe_set_vectorized_backedge_value (loop_vinfo,
12102                                                              pat_stmt_info);
12103                     }
12104                   else
12105                     {
12106                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12107                                                     &seen_store))
12108                         maybe_set_vectorized_backedge_value (loop_vinfo,
12109                                                              stmt_info);
12110                     }
12111                 }
12112               gsi_next (&si);
12113               if (seen_store)
12114                 {
12115                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12116                     /* Interleaving.  If IS_STORE is TRUE, the
12117                        vectorization of the interleaving chain was
12118                        completed - free all the stores in the chain.  */
12119                     vect_remove_stores (loop_vinfo,
12120                                         DR_GROUP_FIRST_ELEMENT (seen_store));
12121                   else
12122                     /* Free the attached stmt_vec_info and remove the stmt.  */
12123                     loop_vinfo->remove_stmt (stmt_info);
12124                 }
12125             }
12126         }
12127
12128       /* Stub out scalar statements that must not survive vectorization.
12129          Doing this here helps with grouped statements, or statements that
12130          are involved in patterns.  */
12131       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12132            !gsi_end_p (gsi); gsi_next (&gsi))
12133         {
12134           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12135           if (!call || !gimple_call_internal_p (call))
12136             continue;
12137           internal_fn ifn = gimple_call_internal_fn (call);
12138           if (ifn == IFN_MASK_LOAD)
12139             {
12140               tree lhs = gimple_get_lhs (call);
12141               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12142                 {
12143                   tree zero = build_zero_cst (TREE_TYPE (lhs));
12144                   gimple *new_stmt = gimple_build_assign (lhs, zero);
12145                   gsi_replace (&gsi, new_stmt, true);
12146                 }
12147             }
12148           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12149             {
12150               tree lhs = gimple_get_lhs (call);
12151               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12152                 {
12153                   tree else_arg
12154                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12155                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12156                   gsi_replace (&gsi, new_stmt, true);
12157                 }
12158             }
12159         }
12160     }                           /* BBs in loop */
12161
12162   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
12163      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
12164   if (integer_onep (step_vector))
12165     niters_no_overflow = true;
12166   vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12167                            niters_vector, step_vector, niters_vector_mult_vf,
12168                            !niters_no_overflow);
12169
12170   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12171
12172   /* True if the final iteration might not handle a full vector's
12173      worth of scalar iterations.  */
12174   bool final_iter_may_be_partial
12175     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12176   /* The minimum number of iterations performed by the epilogue.  This
12177      is 1 when peeling for gaps because we always need a final scalar
12178      iteration.  */
12179   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12180   /* +1 to convert latch counts to loop iteration counts,
12181      -min_epilogue_iters to remove iterations that cannot be performed
12182        by the vector code.  */
12183   int bias_for_lowest = 1 - min_epilogue_iters;
12184   int bias_for_assumed = bias_for_lowest;
12185   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12186   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12187     {
12188       /* When the amount of peeling is known at compile time, the first
12189          iteration will have exactly alignment_npeels active elements.
12190          In the worst case it will have at least one.  */
12191       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12192       bias_for_lowest += lowest_vf - min_first_active;
12193       bias_for_assumed += assumed_vf - min_first_active;
12194     }
12195   /* In these calculations the "- 1" converts loop iteration counts
12196      back to latch counts.  */
12197   if (loop->any_upper_bound)
12198     {
12199       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12200       loop->nb_iterations_upper_bound
12201         = (final_iter_may_be_partial
12202            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12203                             lowest_vf) - 1
12204            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12205                              lowest_vf) - 1);
12206       if (main_vinfo
12207           /* Both peeling for alignment and peeling for gaps can end up
12208              with the scalar epilogue running for more than VF-1 iterations.  */
12209           && !main_vinfo->peeling_for_alignment
12210           && !main_vinfo->peeling_for_gaps)
12211         {
12212           unsigned int bound;
12213           poly_uint64 main_iters
12214             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12215                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12216           main_iters
12217             = upper_bound (main_iters,
12218                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12219           if (can_div_away_from_zero_p (main_iters,
12220                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12221                                         &bound))
12222             loop->nb_iterations_upper_bound
12223               = wi::umin ((bound_wide_int) (bound - 1),
12224                           loop->nb_iterations_upper_bound);
12225       }
12226   }
12227   if (loop->any_likely_upper_bound)
12228     loop->nb_iterations_likely_upper_bound
12229       = (final_iter_may_be_partial
12230          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12231                           + bias_for_lowest, lowest_vf) - 1
12232          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12233                            + bias_for_lowest, lowest_vf) - 1);
12234   if (loop->any_estimate)
12235     loop->nb_iterations_estimate
12236       = (final_iter_may_be_partial
12237          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12238                           assumed_vf) - 1
12239          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12240                            assumed_vf) - 1);
12241   scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12242                                assumed_vf, flat);
12243
12244   if (dump_enabled_p ())
12245     {
12246       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12247         {
12248           dump_printf_loc (MSG_NOTE, vect_location,
12249                            "LOOP VECTORIZED\n");
12250           if (loop->inner)
12251             dump_printf_loc (MSG_NOTE, vect_location,
12252                              "OUTER LOOP VECTORIZED\n");
12253           dump_printf (MSG_NOTE, "\n");
12254         }
12255       else
12256         dump_printf_loc (MSG_NOTE, vect_location,
12257                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12258                          GET_MODE_NAME (loop_vinfo->vector_mode));
12259     }
12260
12261   /* Loops vectorized with a variable factor won't benefit from
12262      unrolling/peeling.  */
12263   if (!vf.is_constant ())
12264     {
12265       loop->unroll = 1;
12266       if (dump_enabled_p ())
12267         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12268                          " variable-length vectorization factor\n");
12269     }
12270   /* Free SLP instances here because otherwise stmt reference counting
12271      won't work.  */
12272   slp_instance instance;
12273   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12274     vect_free_slp_instance (instance);
12275   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12276   /* Clear-up safelen field since its value is invalid after vectorization
12277      since vectorized loop can have loop-carried dependencies.  */
12278   loop->safelen = 0;
12279
12280   if (epilogue)
12281     {
12282       update_epilogue_loop_vinfo (epilogue, advance);
12283
12284       epilogue->simduid = loop->simduid;
12285       epilogue->force_vectorize = loop->force_vectorize;
12286       epilogue->dont_vectorize = false;
12287     }
12288
12289   return epilogue;
12290 }
12291
12292 /* The code below is trying to perform simple optimization - revert
12293    if-conversion for masked stores, i.e. if the mask of a store is zero
12294    do not perform it and all stored value producers also if possible.
12295    For example,
12296      for (i=0; i<n; i++)
12297        if (c[i])
12298         {
12299           p1[i] += 1;
12300           p2[i] = p3[i] +2;
12301         }
12302    this transformation will produce the following semi-hammock:
12303
12304    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12305      {
12306        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12307        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12308        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12309        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12310        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12311        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12312      }
12313 */
12314
12315 void
12316 optimize_mask_stores (class loop *loop)
12317 {
12318   basic_block *bbs = get_loop_body (loop);
12319   unsigned nbbs = loop->num_nodes;
12320   unsigned i;
12321   basic_block bb;
12322   class loop *bb_loop;
12323   gimple_stmt_iterator gsi;
12324   gimple *stmt;
12325   auto_vec<gimple *> worklist;
12326   auto_purge_vect_location sentinel;
12327
12328   vect_location = find_loop_location (loop);
12329   /* Pick up all masked stores in loop if any.  */
12330   for (i = 0; i < nbbs; i++)
12331     {
12332       bb = bbs[i];
12333       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12334            gsi_next (&gsi))
12335         {
12336           stmt = gsi_stmt (gsi);
12337           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12338             worklist.safe_push (stmt);
12339         }
12340     }
12341
12342   free (bbs);
12343   if (worklist.is_empty ())
12344     return;
12345
12346   /* Loop has masked stores.  */
12347   while (!worklist.is_empty ())
12348     {
12349       gimple *last, *last_store;
12350       edge e, efalse;
12351       tree mask;
12352       basic_block store_bb, join_bb;
12353       gimple_stmt_iterator gsi_to;
12354       tree vdef, new_vdef;
12355       gphi *phi;
12356       tree vectype;
12357       tree zero;
12358
12359       last = worklist.pop ();
12360       mask = gimple_call_arg (last, 2);
12361       bb = gimple_bb (last);
12362       /* Create then_bb and if-then structure in CFG, then_bb belongs to
12363          the same loop as if_bb.  It could be different to LOOP when two
12364          level loop-nest is vectorized and mask_store belongs to the inner
12365          one.  */
12366       e = split_block (bb, last);
12367       bb_loop = bb->loop_father;
12368       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12369       join_bb = e->dest;
12370       store_bb = create_empty_bb (bb);
12371       add_bb_to_loop (store_bb, bb_loop);
12372       e->flags = EDGE_TRUE_VALUE;
12373       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12374       /* Put STORE_BB to likely part.  */
12375       efalse->probability = profile_probability::likely ();
12376       e->probability = efalse->probability.invert ();
12377       store_bb->count = efalse->count ();
12378       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12379       if (dom_info_available_p (CDI_DOMINATORS))
12380         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12381       if (dump_enabled_p ())
12382         dump_printf_loc (MSG_NOTE, vect_location,
12383                          "Create new block %d to sink mask stores.",
12384                          store_bb->index);
12385       /* Create vector comparison with boolean result.  */
12386       vectype = TREE_TYPE (mask);
12387       zero = build_zero_cst (vectype);
12388       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12389       gsi = gsi_last_bb (bb);
12390       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12391       /* Create new PHI node for vdef of the last masked store:
12392          .MEM_2 = VDEF <.MEM_1>
12393          will be converted to
12394          .MEM.3 = VDEF <.MEM_1>
12395          and new PHI node will be created in join bb
12396          .MEM_2 = PHI <.MEM_1, .MEM_3>
12397       */
12398       vdef = gimple_vdef (last);
12399       new_vdef = make_ssa_name (gimple_vop (cfun), last);
12400       gimple_set_vdef (last, new_vdef);
12401       phi = create_phi_node (vdef, join_bb);
12402       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12403
12404       /* Put all masked stores with the same mask to STORE_BB if possible.  */
12405       while (true)
12406         {
12407           gimple_stmt_iterator gsi_from;
12408           gimple *stmt1 = NULL;
12409
12410           /* Move masked store to STORE_BB.  */
12411           last_store = last;
12412           gsi = gsi_for_stmt (last);
12413           gsi_from = gsi;
12414           /* Shift GSI to the previous stmt for further traversal.  */
12415           gsi_prev (&gsi);
12416           gsi_to = gsi_start_bb (store_bb);
12417           gsi_move_before (&gsi_from, &gsi_to);
12418           /* Setup GSI_TO to the non-empty block start.  */
12419           gsi_to = gsi_start_bb (store_bb);
12420           if (dump_enabled_p ())
12421             dump_printf_loc (MSG_NOTE, vect_location,
12422                              "Move stmt to created bb\n%G", last);
12423           /* Move all stored value producers if possible.  */
12424           while (!gsi_end_p (gsi))
12425             {
12426               tree lhs;
12427               imm_use_iterator imm_iter;
12428               use_operand_p use_p;
12429               bool res;
12430
12431               /* Skip debug statements.  */
12432               if (is_gimple_debug (gsi_stmt (gsi)))
12433                 {
12434                   gsi_prev (&gsi);
12435                   continue;
12436                 }
12437               stmt1 = gsi_stmt (gsi);
12438               /* Do not consider statements writing to memory or having
12439                  volatile operand.  */
12440               if (gimple_vdef (stmt1)
12441                   || gimple_has_volatile_ops (stmt1))
12442                 break;
12443               gsi_from = gsi;
12444               gsi_prev (&gsi);
12445               lhs = gimple_get_lhs (stmt1);
12446               if (!lhs)
12447                 break;
12448
12449               /* LHS of vectorized stmt must be SSA_NAME.  */
12450               if (TREE_CODE (lhs) != SSA_NAME)
12451                 break;
12452
12453               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12454                 {
12455                   /* Remove dead scalar statement.  */
12456                   if (has_zero_uses (lhs))
12457                     {
12458                       gsi_remove (&gsi_from, true);
12459                       continue;
12460                     }
12461                 }
12462
12463               /* Check that LHS does not have uses outside of STORE_BB.  */
12464               res = true;
12465               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12466                 {
12467                   gimple *use_stmt;
12468                   use_stmt = USE_STMT (use_p);
12469                   if (is_gimple_debug (use_stmt))
12470                     continue;
12471                   if (gimple_bb (use_stmt) != store_bb)
12472                     {
12473                       res = false;
12474                       break;
12475                     }
12476                 }
12477               if (!res)
12478                 break;
12479
12480               if (gimple_vuse (stmt1)
12481                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
12482                 break;
12483
12484               /* Can move STMT1 to STORE_BB.  */
12485               if (dump_enabled_p ())
12486                 dump_printf_loc (MSG_NOTE, vect_location,
12487                                  "Move stmt to created bb\n%G", stmt1);
12488               gsi_move_before (&gsi_from, &gsi_to);
12489               /* Shift GSI_TO for further insertion.  */
12490               gsi_prev (&gsi_to);
12491             }
12492           /* Put other masked stores with the same mask to STORE_BB.  */
12493           if (worklist.is_empty ()
12494               || gimple_call_arg (worklist.last (), 2) != mask
12495               || worklist.last () != stmt1)
12496             break;
12497           last = worklist.pop ();
12498         }
12499       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12500     }
12501 }
12502
12503 /* Decide whether it is possible to use a zero-based induction variable
12504    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
12505    the value that the induction variable must be able to hold in order
12506    to ensure that the rgroups eventually have no active vector elements.
12507    Return -1 otherwise.  */
12508
12509 widest_int
12510 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12511 {
12512   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12513   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12514   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12515
12516   /* Calculate the value that the induction variable must be able
12517      to hit in order to ensure that we end the loop with an all-false mask.
12518      This involves adding the maximum number of inactive trailing scalar
12519      iterations.  */
12520   widest_int iv_limit = -1;
12521   if (max_loop_iterations (loop, &iv_limit))
12522     {
12523       if (niters_skip)
12524         {
12525           /* Add the maximum number of skipped iterations to the
12526              maximum iteration count.  */
12527           if (TREE_CODE (niters_skip) == INTEGER_CST)
12528             iv_limit += wi::to_widest (niters_skip);
12529           else
12530             iv_limit += max_vf - 1;
12531         }
12532       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12533         /* Make a conservatively-correct assumption.  */
12534         iv_limit += max_vf - 1;
12535
12536       /* IV_LIMIT is the maximum number of latch iterations, which is also
12537          the maximum in-range IV value.  Round this value down to the previous
12538          vector alignment boundary and then add an extra full iteration.  */
12539       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12540       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12541     }
12542   return iv_limit;
12543 }
12544
12545 /* For the given rgroup_controls RGC, check whether an induction variable
12546    would ever hit a value that produces a set of all-false masks or zero
12547    lengths before wrapping around.  Return true if it's possible to wrap
12548    around before hitting the desirable value, otherwise return false.  */
12549
12550 bool
12551 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12552 {
12553   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12554
12555   if (iv_limit == -1)
12556     return true;
12557
12558   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12559   unsigned int compare_precision = TYPE_PRECISION (compare_type);
12560   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12561
12562   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12563     return true;
12564
12565   return false;
12566 }