gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
71 as if it was manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
143 Targets that can support different sizes of vectors will, for now, need
144 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
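
     For example, the support check described above under "Target modeling"
     boils down to something like the following (illustrative sketch only):

        optab_handler (add_optab, V8HImode) != CODE_FOR_nothing

     i.e. the stmt can be vectorized only if the target provides an insn
     pattern for the required operation in the required vector mode.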
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 bool *, bool *, bool);
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164 may already be set for general statements (not just data refs). */
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 bool vectype_maybe_set_p,
169 poly_uint64 *vf)
171 gimple *stmt = stmt_info->stmt;
173 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174 && !STMT_VINFO_LIVE_P (stmt_info))
175 || gimple_clobber_p (stmt))
177 if (dump_enabled_p ())
178 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179 return opt_result::success ();
182 tree stmt_vectype, nunits_vectype;
183 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 &stmt_vectype,
185 &nunits_vectype);
186 if (!res)
187 return res;
189 if (stmt_vectype)
191 if (STMT_VINFO_VECTYPE (stmt_info))
192	/* The only case when a vectype had already been set is for stmts
193 that contain a data ref, or for "pattern-stmts" (stmts generated
194 by the vectorizer to represent/replace a certain idiom). */
195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 || vectype_maybe_set_p)
197 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
205 return opt_result::success ();
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. Return true on success
211 or false if something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 stmt_vec_info stmt_info, poly_uint64 *vf)
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221 if (!res)
222 return res;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_NOTE, vect_location,
237 "==> examining pattern def stmt: %G",
238 def_stmt_info->stmt);
239 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 if (!res)
241 return res;
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location,
246 "==> examining pattern statement: %G",
247 stmt_info->stmt);
248 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249 if (!res)
250 return res;
253 return opt_result::success ();
256 /* Function vect_determine_vectorization_factor
258 Determine the vectorization factor (VF). VF is the number of data elements
259 that are operated upon in parallel in a single iteration of the vectorized
260 loop. For example, when vectorizing a loop that operates on 4-byte elements,
261 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
262 elements can fit in a single vector register.
264 We currently support vectorization of loops in which all types operated upon
265 are of the same size. Therefore this function currently sets VF according to
266 the size of the types operated upon, and fails if there are multiple sizes
267 in the loop.
269 VF is also the factor by which the loop iterations are strip-mined, e.g.:
270 original loop:
271 for (i=0; i<N; i++){
272 a[i] = b[i] + c[i];
275 vectorized loop:
276 for (i=0; i<N; i+=VF){
277 a[i:VF] = b[i:VF] + c[i:VF];
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286 unsigned nbbs = loop->num_nodes;
287 poly_uint64 vectorization_factor = 1;
288 tree scalar_type = NULL_TREE;
289 gphi *phi;
290 tree vectype;
291 stmt_vec_info stmt_info;
292 unsigned i;
294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296 for (i = 0; i < nbbs; i++)
298 basic_block bb = bbs[i];
300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 gsi_next (&si))
303 phi = si.phi ();
304 stmt_info = loop_vinfo->lookup_stmt (phi);
305 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 (gimple *) phi);
309 gcc_assert (stmt_info);
311 if (STMT_VINFO_RELEVANT_P (stmt_info)
312 || STMT_VINFO_LIVE_P (stmt_info))
314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315 scalar_type = TREE_TYPE (PHI_RESULT (phi));
317 if (dump_enabled_p ())
318 dump_printf_loc (MSG_NOTE, vect_location,
319 "get vectype for scalar type: %T\n",
320 scalar_type);
322 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 if (!vectype)
324 return opt_result::failure_at (phi,
325 "not vectorized: unsupported "
326 "data-type %T\n",
327 scalar_type);
328 STMT_VINFO_VECTYPE (stmt_info) = vectype;
330 if (dump_enabled_p ())
331 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 vectype);
334 if (dump_enabled_p ())
336 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 dump_printf (MSG_NOTE, "\n");
341 vect_update_max_nunits (&vectorization_factor, vectype);
345 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 gsi_next (&si))
348 if (is_gimple_debug (gsi_stmt (si)))
349 continue;
350 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 opt_result res
352 = vect_determine_vf_for_stmt (loop_vinfo,
353 stmt_info, &vectorization_factor);
354 if (!res)
355 return res;
359 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
360 if (dump_enabled_p ())
362 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363 dump_dec (MSG_NOTE, vectorization_factor);
364 dump_printf (MSG_NOTE, "\n");
367 if (known_le (vectorization_factor, 1U))
368 return opt_result::failure_at (vect_location,
369 "not vectorized: unsupported data-type\n");
370 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371 return opt_result::success ();
375 /* Function vect_is_simple_iv_evolution.
377 FORNOW: A simple evolution of an induction variable in the loop is
378 considered a polynomial evolution. */
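/* A hedged illustration (variable names are hypothetical): for

     for (i = 0; i < n; i++)
       p = p + 4;

   the scalar evolution of 'p' is the degree-1 chrec {p_init, +, 4}, so
   *INIT is p_init and *STEP is 4.  If the evolution part is itself a
   chrec (degree >= 2), or the step does not satisfy the conditions
   checked below, the evolution is not considered "simple".  */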
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382 tree * step)
384 tree init_expr;
385 tree step_expr;
386 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387 basic_block bb;
389 /* When there is no evolution in this loop, the evolution function
390 is not "simple". */
391 if (evolution_part == NULL_TREE)
392 return false;
394 /* When the evolution is a polynomial of degree >= 2
395 the evolution function is not "simple". */
396 if (tree_is_chrec (evolution_part))
397 return false;
399 step_expr = evolution_part;
400 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
402 if (dump_enabled_p ())
403 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
404 step_expr, init_expr);
406 *init = init_expr;
407 *step = step_expr;
409 if (TREE_CODE (step_expr) != INTEGER_CST
410 && (TREE_CODE (step_expr) != SSA_NAME
411 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 || !flag_associative_math)))
416 && (TREE_CODE (step_expr) != REAL_CST
417 || !flag_associative_math))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "step unknown.\n");
422 return false;
425 return true;
428 /* Function vect_is_nonlinear_iv_evolution
430 Only support nonlinear induction for integer types:
431 1. neg
432 2. mul by constant
433 3. lshift/rshift by constant.
435 For neg induction, return a fake step as integer -1. */
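/* Hypothetical source forms of the supported nonlinear inductions
   (shown for illustration only):

     x = -x;       // neg:     step recorded as -1
     y = y * 3;    // mul:     step is the constant multiplier
     z = z << 2;   // lshift:  step is the constant shift amount
     w = w >> 1;   // rshift:  step is the constant shift amount

   each updating a loop-header PHI whose latch value is the result of
   the shown statement.  */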
436 static bool
437 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
438 gphi* loop_phi_node, tree *init, tree *step)
440 tree init_expr, ev_expr, result, op1, op2;
441 gimple* def;
443 if (gimple_phi_num_args (loop_phi_node) != 2)
444 return false;
446 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
447 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
449 /* Support nonlinear induction only for integer type. */
450 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
451 return false;
453 *init = init_expr;
454 result = PHI_RESULT (loop_phi_node);
456 if (TREE_CODE (ev_expr) != SSA_NAME
457 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
458 || !is_gimple_assign (def))
459 return false;
461 enum tree_code t_code = gimple_assign_rhs_code (def);
462 switch (t_code)
464 case NEGATE_EXPR:
465 if (gimple_assign_rhs1 (def) != result)
466 return false;
467 *step = build_int_cst (TREE_TYPE (init_expr), -1);
468 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
469 break;
471 case RSHIFT_EXPR:
472 case LSHIFT_EXPR:
473 case MULT_EXPR:
474 op1 = gimple_assign_rhs1 (def);
475 op2 = gimple_assign_rhs2 (def);
476 if (TREE_CODE (op2) != INTEGER_CST
477 || op1 != result)
478 return false;
479 *step = op2;
480 if (t_code == LSHIFT_EXPR)
481 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
482 else if (t_code == RSHIFT_EXPR)
483 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
484 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
485 else
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
487 break;
489 default:
490 return false;
493 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
494 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
496 return true;
499 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
500 what we are assuming is a double reduction. For example, given
501 a structure like this:
503 outer1:
504 x_1 = PHI <x_4(outer2), ...>;
507 inner:
508 x_2 = PHI <x_1(outer1), ...>;
510 x_3 = ...;
513 outer2:
514 x_4 = PHI <x_3(inner)>;
517 outer loop analysis would treat x_1 as a double reduction phi and
518 this function would then return true for x_2. */
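/* A source-level sketch that produces this shape (array and variable
   names are hypothetical):

     s = 0;
     for (i = 0; i < n; i++)     // outer loop: x_1 / x_4
       for (j = 0; j < m; j++)   // inner loop: x_2 / x_3
         s += a[i][j];

   When the outer loop is analyzed, the outer PHI of 's' is the double
   reduction PHI and the inner PHI is what this predicate detects.  */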
520 static bool
521 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
523 use_operand_p use_p;
524 ssa_op_iter op_iter;
525 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
526 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
527 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
528 return true;
529 return false;
532 /* Returns true if Phi is a first-order recurrence. A first-order
533 recurrence is a non-reduction recurrence relation in which the value of
534 the recurrence in the current loop iteration equals a value defined in
535 the previous iteration. */
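/* A hypothetical example of such a recurrence (names for illustration):

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;   // uses the value from the previous iteration
         t = a[i];
       }

   The loop-header PHI for 't' satisfies the conditions checked below.  */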
537 static bool
538 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
539 gphi *phi)
541 /* Ensure the loop latch definition is from within the loop. */
542 edge latch = loop_latch_edge (loop);
543 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
544 if (TREE_CODE (ldef) != SSA_NAME
545 || SSA_NAME_IS_DEFAULT_DEF (ldef)
546 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
547 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
548 return false;
550 tree def = gimple_phi_result (phi);
552 /* Ensure every use_stmt of the phi node is dominated by the latch
553 definition. */
554 imm_use_iterator imm_iter;
555 use_operand_p use_p;
556 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
557 if (!is_gimple_debug (USE_STMT (use_p))
558 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
559 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
560 USE_STMT (use_p))))
561 return false;
563 /* First-order recurrence autovectorization needs shuffle vector. */
564 tree scalar_type = TREE_TYPE (def);
565 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
566 if (!vectype)
567 return false;
569 return true;
572 /* Function vect_analyze_scalar_cycles_1.
574 Examine the cross iteration def-use cycles of scalar variables
575 in LOOP. LOOP_VINFO represents the loop that is now being
576 considered for vectorization (can be LOOP, or an outer-loop
577 enclosing LOOP). SLP indicates whether there will be subsequent
578 SLP analyses. */
580 static void
581 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
582 bool slp)
584 basic_block bb = loop->header;
585 tree init, step;
586 auto_vec<stmt_vec_info, 64> worklist;
587 gphi_iterator gsi;
588 bool double_reduc, reduc_chain;
590 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
592 /* First - identify all inductions. Reduction detection assumes that all the
593 inductions have been identified, therefore, this order must not be
594 changed. */
595 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
597 gphi *phi = gsi.phi ();
598 tree access_fn = NULL;
599 tree def = PHI_RESULT (phi);
600 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
602 if (dump_enabled_p ())
603 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
604 (gimple *) phi);
606 /* Skip virtual phi's. The data dependences that are associated with
607 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
608 if (virtual_operand_p (def))
609 continue;
611 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
613 /* Analyze the evolution function. */
614 access_fn = analyze_scalar_evolution (loop, def);
615 if (access_fn)
617 STRIP_NOPS (access_fn);
618 if (dump_enabled_p ())
619 dump_printf_loc (MSG_NOTE, vect_location,
620 "Access function of PHI: %T\n", access_fn);
621 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
622 = initial_condition_in_loop_num (access_fn, loop->num);
623 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
624 = evolution_part_in_loop_num (access_fn, loop->num);
627 if ((!access_fn
628 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
629 || !vect_is_simple_iv_evolution (loop->num, access_fn,
630 &init, &step)
631 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
632 && TREE_CODE (step) != INTEGER_CST))
633 /* Only handle nonlinear iv for same loop. */
634 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
635 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
636 phi, &init, &step)))
638 worklist.safe_push (stmt_vinfo);
639 continue;
642 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
643 != NULL_TREE);
644 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
648 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
652 /* Second - identify all reductions and nested cycles. */
653 while (worklist.length () > 0)
655 stmt_vec_info stmt_vinfo = worklist.pop ();
656 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
657 tree def = PHI_RESULT (phi);
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
661 (gimple *) phi);
663 gcc_assert (!virtual_operand_p (def)
664 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
666 stmt_vec_info reduc_stmt_info
667 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
668 &reduc_chain, slp);
669 if (reduc_stmt_info)
671 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
672 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
673 if (double_reduc)
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location,
677 "Detected double reduction.\n");
679 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
680 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
682 else
684 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_NOTE, vect_location,
688 "Detected vectorizable nested cycle.\n");
690 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
692 else
694 if (dump_enabled_p ())
695 dump_printf_loc (MSG_NOTE, vect_location,
696 "Detected reduction.\n");
698 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
699 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
700 /* Store the reduction cycles for possible vectorization in
701 loop-aware SLP if it was not detected as a reduction
702 chain. */
703 if (! reduc_chain)
704 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
705 (reduc_stmt_info);
709 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
710 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
711 else
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Unknown def-use cycle pattern.\n");
719 /* Function vect_analyze_scalar_cycles.
721 Examine the cross iteration def-use cycles of scalar variables, by
722 analyzing the loop-header PHIs of scalar variables. Classify each
723 cycle as one of the following: invariant, induction, reduction, unknown.
724 We do that for the loop represented by LOOP_VINFO, and also for its
725 inner loop, if it exists.
726 Examples for scalar cycles:
728 Example1: reduction:
730 loop1:
731 for (i=0; i<N; i++)
732 sum += a[i];
734 Example2: induction:
736 loop2:
737 for (i=0; i<N; i++)
738 a[i] = i; */
740 static void
741 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
743 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
745 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
747 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
748 Reductions in such inner-loop therefore have different properties than
749 the reductions in the nest that gets vectorized:
750 1. When vectorized, they are executed in the same order as in the original
751 scalar loop, so we can't change the order of computation when
752 vectorizing them.
753 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
754 current checks are too strict. */
756 if (loop->inner)
757 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
760 /* Transfer group and reduction information from STMT_INFO to its
761 pattern stmt. */
763 static void
764 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
766 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
767 stmt_vec_info stmtp;
768 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
769 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
770 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
773 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
774 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
775 == STMT_VINFO_DEF_TYPE (stmt_info));
776 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
777 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
778 if (stmt_info)
779 REDUC_GROUP_NEXT_ELEMENT (stmtp)
780 = STMT_VINFO_RELATED_STMT (stmt_info);
782 while (stmt_info);
785 /* Fixup scalar cycles that now have their stmts detected as patterns. */
787 static void
788 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
790 stmt_vec_info first;
791 unsigned i;
793 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
795 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
796 while (next)
798 if ((STMT_VINFO_IN_PATTERN_P (next)
799 != STMT_VINFO_IN_PATTERN_P (first))
800 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
801 break;
802 next = REDUC_GROUP_NEXT_ELEMENT (next);
804 /* If all reduction chain members are well-formed patterns, adjust
805 the group to group the pattern stmts instead. */
806 if (! next
807 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
809 if (STMT_VINFO_IN_PATTERN_P (first))
811 vect_fixup_reduc_chain (first);
812 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
813 = STMT_VINFO_RELATED_STMT (first);
816 /* If not all stmts in the chain are patterns, or if we failed
817 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
818 it as a regular reduction instead. */
819 else
821 stmt_vec_info vinfo = first;
822 stmt_vec_info last = NULL;
823 while (vinfo)
825 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
826 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
827 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
828 last = vinfo;
829 vinfo = next;
831 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
832 = vect_internal_def;
833 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
834 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
835 --i;
840 /* Function vect_get_loop_niters.
842 Determine how many iterations the loop is executed and place it
843 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
844 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
845 niter information holds in ASSUMPTIONS.
847 Return the loop exit condition. */
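/* Illustration of the intended semantics: for a loop whose latch runs
   exactly 99 times, NUMBER_OF_ITERATIONSM1 is 99 and NUMBER_OF_ITERATIONS
   (the number of header executions) is 100, assuming the niter analysis
   below succeeds.  */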
850 static gcond *
851 vect_get_loop_niters (class loop *loop, tree *assumptions,
852 tree *number_of_iterations, tree *number_of_iterationsm1)
854 edge exit = single_exit (loop);
855 class tree_niter_desc niter_desc;
856 tree niter_assumptions, niter, may_be_zero;
857 gcond *cond = get_loop_exit_condition (loop);
859 *assumptions = boolean_true_node;
860 *number_of_iterationsm1 = chrec_dont_know;
861 *number_of_iterations = chrec_dont_know;
862 DUMP_VECT_SCOPE ("get_loop_niters");
864 if (!exit)
865 return cond;
867 may_be_zero = NULL_TREE;
868 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
869 || chrec_contains_undetermined (niter_desc.niter))
870 return cond;
872 niter_assumptions = niter_desc.assumptions;
873 may_be_zero = niter_desc.may_be_zero;
874 niter = niter_desc.niter;
876 if (may_be_zero && integer_zerop (may_be_zero))
877 may_be_zero = NULL_TREE;
879 if (may_be_zero)
881 if (COMPARISON_CLASS_P (may_be_zero))
883 /* Try to combine may_be_zero with assumptions, this can simplify
884 computation of niter expression. */
885 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
886 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
887 niter_assumptions,
888 fold_build1 (TRUTH_NOT_EXPR,
889 boolean_type_node,
890 may_be_zero));
891 else
892 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
893 build_int_cst (TREE_TYPE (niter), 0),
894 rewrite_to_non_trapping_overflow (niter));
896 may_be_zero = NULL_TREE;
898 else if (integer_nonzerop (may_be_zero))
900 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
901 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
902 return cond;
904 else
905 return cond;
908 *assumptions = niter_assumptions;
909 *number_of_iterationsm1 = niter;
911 /* We want the number of loop header executions which is the number
912 of latch executions plus one.
913 ??? For UINT_MAX latch executions this number overflows to zero
914 for loops like do { n++; } while (n != 0); */
915 if (niter && !chrec_contains_undetermined (niter))
916 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
917 build_int_cst (TREE_TYPE (niter), 1));
918 *number_of_iterations = niter;
920 return cond;
923 /* Function bb_in_loop_p
925 Used as predicate for dfs order traversal of the loop bbs. */
927 static bool
928 bb_in_loop_p (const_basic_block bb, const void *data)
930 const class loop *const loop = (const class loop *)data;
931 if (flow_bb_inside_loop_p (loop, bb))
932 return true;
933 return false;
937 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
938 stmt_vec_info structs for all the stmts in LOOP_IN. */
940 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
941 : vec_info (vec_info::loop, shared),
942 loop (loop_in),
943 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
944 num_itersm1 (NULL_TREE),
945 num_iters (NULL_TREE),
946 num_iters_unchanged (NULL_TREE),
947 num_iters_assumptions (NULL_TREE),
948 vector_costs (nullptr),
949 scalar_costs (nullptr),
950 th (0),
951 versioning_threshold (0),
952 vectorization_factor (0),
953 main_loop_edge (nullptr),
954 skip_main_loop_edge (nullptr),
955 skip_this_loop_edge (nullptr),
956 reusable_accumulators (),
957 suggested_unroll_factor (1),
958 max_vectorization_factor (0),
959 mask_skip_niters (NULL_TREE),
960 rgroup_compare_type (NULL_TREE),
961 simd_if_cond (NULL_TREE),
962 unaligned_dr (NULL),
963 peeling_for_alignment (0),
964 ptr_mask (0),
965 ivexpr_map (NULL),
966 scan_map (NULL),
967 slp_unrolling_factor (1),
968 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
969 vectorizable (false),
970 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
971 using_partial_vectors_p (false),
972 epil_using_partial_vectors_p (false),
973 partial_load_store_bias (0),
974 peeling_for_gaps (false),
975 peeling_for_niter (false),
976 no_data_dependencies (false),
977 has_mask_store (false),
978 scalar_loop_scaling (profile_probability::uninitialized ()),
979 scalar_loop (NULL),
980 orig_loop_info (NULL)
982 /* CHECKME: We want to visit all BBs before their successors (except for
983 latch blocks, for which this assertion wouldn't hold). In the simple
984 case of the loop forms we allow, a dfs order of the BBs would be the same
985 as reversed postorder traversal, so we are safe. */
987 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
988 bbs, loop->num_nodes, loop);
989 gcc_assert (nbbs == loop->num_nodes);
991 for (unsigned int i = 0; i < nbbs; i++)
993 basic_block bb = bbs[i];
994 gimple_stmt_iterator si;
996 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
998 gimple *phi = gsi_stmt (si);
999 gimple_set_uid (phi, 0);
1000 add_stmt (phi);
1003 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1005 gimple *stmt = gsi_stmt (si);
1006 gimple_set_uid (stmt, 0);
1007 if (is_gimple_debug (stmt))
1008 continue;
1009 add_stmt (stmt);
1010 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1011 third argument is the #pragma omp simd if (x) condition: when it is 0,
1012 the loop shouldn't be vectorized; when it is a non-zero constant, it
1013 should be vectorized normally; otherwise the loop is versioned, with the
1014 vectorized version taken if the condition is non-zero at runtime. */
1015 if (loop_in->simduid
1016 && is_gimple_call (stmt)
1017 && gimple_call_internal_p (stmt)
1018 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1019 && gimple_call_num_args (stmt) >= 3
1020 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1021 && (loop_in->simduid
1022 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1024 tree arg = gimple_call_arg (stmt, 2);
1025 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1026 simd_if_cond = arg;
1027 else
1028 gcc_assert (integer_nonzerop (arg));
1033 epilogue_vinfos.create (6);
1036 /* Free all levels of rgroup CONTROLS. */
1038 void
1039 release_vec_loop_controls (vec<rgroup_controls> *controls)
1041 rgroup_controls *rgc;
1042 unsigned int i;
1043 FOR_EACH_VEC_ELT (*controls, i, rgc)
1044 rgc->controls.release ();
1045 controls->release ();
1048 /* Free all memory used by the _loop_vec_info, as well as all the
1049 stmt_vec_info structs of all the stmts in the loop. */
1051 _loop_vec_info::~_loop_vec_info ()
1053 free (bbs);
1055 release_vec_loop_controls (&masks);
1056 release_vec_loop_controls (&lens);
1057 delete ivexpr_map;
1058 delete scan_map;
1059 epilogue_vinfos.release ();
1060 delete scalar_costs;
1061 delete vector_costs;
1063 /* When we release an epilogue vinfo that we do not intend to use
1064 avoid clearing AUX of the main loop which should continue to
1065 point to the main loop vinfo since otherwise we'll leak that. */
1066 if (loop->aux == this)
1067 loop->aux = NULL;
1070 /* Return an invariant or register for EXPR and emit necessary
1071 computations in the LOOP_VINFO loop preheader. */
1073 tree
1074 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1076 if (is_gimple_reg (expr)
1077 || is_gimple_min_invariant (expr))
1078 return expr;
1080 if (! loop_vinfo->ivexpr_map)
1081 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1082 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1083 if (! cached)
1085 gimple_seq stmts = NULL;
1086 cached = force_gimple_operand (unshare_expr (expr),
1087 &stmts, true, NULL_TREE);
1088 if (stmts)
1090 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1091 gsi_insert_seq_on_edge_immediate (e, stmts);
1094 return cached;
1097 /* Return true if we can use CMP_TYPE as the comparison type to produce
1098 all masks required to mask LOOP_VINFO. */
1100 static bool
1101 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1103 rgroup_controls *rgm;
1104 unsigned int i;
1105 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1106 if (rgm->type != NULL_TREE
1107 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1108 cmp_type, rgm->type,
1109 OPTIMIZE_FOR_SPEED))
1110 return false;
1111 return true;
1114 /* Calculate the maximum number of scalars per iteration for every
1115 rgroup in LOOP_VINFO. */
1117 static unsigned int
1118 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1120 unsigned int res = 1;
1121 unsigned int i;
1122 rgroup_controls *rgm;
1123 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1124 res = MAX (res, rgm->max_nscalars_per_iter);
1125 return res;
1128 /* Calculate the minimum precision necessary to represent:
1130 MAX_NITERS * FACTOR
1132 as an unsigned integer, where MAX_NITERS is the maximum number of
1133 loop header iterations for the original scalar form of LOOP_VINFO. */
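/* Worked example with illustrative numbers: if the scalar loop runs at
   most 1000 header iterations and FACTOR is 4, then MAX_NITERS * FACTOR
   = 4000, which needs 12 bits (2^12 = 4096), so 12 is returned.  */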
1135 static unsigned
1136 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1138 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1140 /* Get the maximum number of iterations that is representable
1141 in the counter type. */
1142 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1143 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1145 /* Get a more refined estimate for the number of iterations. */
1146 widest_int max_back_edges;
1147 if (max_loop_iterations (loop, &max_back_edges))
1148 max_ni = wi::smin (max_ni, max_back_edges + 1);
1150 /* Work out how many bits we need to represent the limit. */
1151 return wi::min_precision (max_ni * factor, UNSIGNED);
1154 /* True if the loop needs peeling or partial vectors when vectorized. */
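/* For instance (illustrative numbers): with a known iteration count of
   100, no peeling for alignment or gaps, and a vectorization factor of
   16, 100 is not a multiple of 16, so this returns true; with a count
   of 96 it would return false.  */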
1156 static bool
1157 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1159 unsigned HOST_WIDE_INT const_vf;
1160 HOST_WIDE_INT max_niter
1161 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1163 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1164 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1165 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1166 (loop_vinfo));
1168 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1169 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1171 /* Work out the (constant) number of iterations that need to be
1172 peeled for reasons other than niters. */
1173 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1174 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1175 peel_niter += 1;
1176 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1177 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1178 return true;
1180 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1181 /* ??? When peeling for gaps but not alignment, we could
1182 try to check whether the (variable) niters is known to be
1183 VF * N + 1. That's something of a niche case though. */
1184 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1185 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1186 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1187 < (unsigned) exact_log2 (const_vf))
1188 /* In case of versioning, check if the maximum number of
1189 iterations is greater than th. If they are identical,
1190 the epilogue is unnecessary. */
1191 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1192 || ((unsigned HOST_WIDE_INT) max_niter
1193 > (th / const_vf) * const_vf))))
1194 return true;
1196 return false;
1199 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1200 whether we can actually generate the masks required. Return true if so,
1201 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1203 static bool
1204 vect_verify_full_masking (loop_vec_info loop_vinfo)
1206 unsigned int min_ni_width;
1207 unsigned int max_nscalars_per_iter
1208 = vect_get_max_nscalars_per_iter (loop_vinfo);
1210 /* Use a normal loop if there are no statements that need masking.
1211 This only happens in rare degenerate cases: it means that the loop
1212 has no loads, no stores, and no live-out values. */
1213 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1214 return false;
1216 /* Work out how many bits we need to represent the limit. */
1217 min_ni_width
1218 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1220 /* Find a scalar mode for which WHILE_ULT is supported. */
1221 opt_scalar_int_mode cmp_mode_iter;
1222 tree cmp_type = NULL_TREE;
1223 tree iv_type = NULL_TREE;
1224 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1225 unsigned int iv_precision = UINT_MAX;
1227 if (iv_limit != -1)
1228 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1229 UNSIGNED);
1231 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1233 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1234 if (cmp_bits >= min_ni_width
1235 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1237 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1238 if (this_type
1239 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1241 /* Although we could stop as soon as we find a valid mode,
1242 there are at least two reasons why that's not always the
1243 best choice:
1245 - An IV that's Pmode or wider is more likely to be reusable
1246 in address calculations than an IV that's narrower than
1247 Pmode.
1249 - Doing the comparison in IV_PRECISION or wider allows
1250 a natural 0-based IV, whereas using a narrower comparison
1251 type requires mitigations against wrap-around.
1253 Conversely, if the IV limit is variable, doing the comparison
1254 in a wider type than the original type can introduce
1255 unnecessary extensions, so picking the widest valid mode
1256 is not always a good choice either.
1258 Here we prefer the first IV type that's Pmode or wider,
1259 and the first comparison type that's IV_PRECISION or wider.
1260 (The comparison type must be no wider than the IV type,
1261 to avoid extensions in the vector loop.)
1263 ??? We might want to try continuing beyond Pmode for ILP32
1264 targets if CMP_BITS < IV_PRECISION. */
1265 iv_type = this_type;
1266 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1267 cmp_type = this_type;
1268 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1269 break;
1274 if (!cmp_type)
1275 return false;
1277 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1278 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1279 return true;
1282 /* Check whether we can use vector accesses with length, based on precision
1283 comparison. So far, to keep it simple, we only allow the case where the
1284 precision of the target-supported length is larger than the precision
1285 required by the loop niters. */
1287 static bool
1288 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1290 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1291 return false;
1293 machine_mode len_load_mode = get_len_load_store_mode
1294 (loop_vinfo->vector_mode, true).require ();
1295 machine_mode len_store_mode = get_len_load_store_mode
1296 (loop_vinfo->vector_mode, false).require ();
1298 signed char partial_load_bias = internal_len_load_store_bias
1299 (IFN_LEN_LOAD, len_load_mode);
1301 signed char partial_store_bias = internal_len_load_store_bias
1302 (IFN_LEN_STORE, len_store_mode);
1304 gcc_assert (partial_load_bias == partial_store_bias);
1306 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1307 return false;
1309 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1310 len_loads with a length of zero. In order to avoid that we prohibit
1311 more than one loop length here. */
1312 if (partial_load_bias == -1
1313 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1314 return false;
1316 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1318 unsigned int max_nitems_per_iter = 1;
1319 unsigned int i;
1320 rgroup_controls *rgl;
1321 /* Find the maximum number of items per iteration for every rgroup. */
1322 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1324 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1325 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1328 /* Work out how many bits we need to represent the length limit. */
1329 unsigned int min_ni_prec
1330 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1332 /* Now use the maximum of the precisions below for one suitable IV type:
1333 - the IV's natural precision
1334 - the precision needed to hold: the maximum number of scalar
1335 iterations multiplied by the scale factor (min_ni_prec above)
1336 - the Pmode precision
1338 If min_ni_prec is less than the precision of the current niters,
1339 we prefer to still use the niters type. Prefer to use a Pmode or
1340 wider IV to avoid narrow conversions. */
1342 unsigned int ni_prec
1343 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1344 min_ni_prec = MAX (min_ni_prec, ni_prec);
1345 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1347 tree iv_type = NULL_TREE;
1348 opt_scalar_int_mode tmode_iter;
1349 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1351 scalar_mode tmode = tmode_iter.require ();
1352 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1354 /* ??? Do we really want to construct one IV whose precision exceeds
1355 BITS_PER_WORD? */
1356 if (tbits > BITS_PER_WORD)
1357 break;
1359 /* Find the first available standard integral type. */
1360 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1362 iv_type = build_nonstandard_integer_type (tbits, true);
1363 break;
1367 if (!iv_type)
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "can't vectorize with length-based partial vectors"
1372 " because there is no suitable iv type.\n");
1373 return false;
1376 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1379 return true;
1382 /* Calculate the cost of one scalar iteration of the loop. */
1383 static void
1384 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1386 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1387 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1388 int nbbs = loop->num_nodes, factor;
1389 int innerloop_iters, i;
1391 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1393 /* Gather costs for statements in the scalar loop. */
1395 /* FORNOW. */
1396 innerloop_iters = 1;
1397 if (loop->inner)
1398 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1400 for (i = 0; i < nbbs; i++)
1402 gimple_stmt_iterator si;
1403 basic_block bb = bbs[i];
1405 if (bb->loop_father == loop->inner)
1406 factor = innerloop_iters;
1407 else
1408 factor = 1;
1410 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1412 gimple *stmt = gsi_stmt (si);
1413 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1415 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1416 continue;
1418 /* Skip stmts that are not vectorized inside the loop. */
1419 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1420 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1421 && (!STMT_VINFO_LIVE_P (vstmt_info)
1422 || !VECTORIZABLE_CYCLE_DEF
1423 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1424 continue;
1426 vect_cost_for_stmt kind;
1427 if (STMT_VINFO_DATA_REF (stmt_info))
1429 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1430 kind = scalar_load;
1431 else
1432 kind = scalar_store;
1434 else if (vect_nop_conversion_p (stmt_info))
1435 continue;
1436 else
1437 kind = scalar_stmt;
1439 /* We are using vect_prologue here to avoid scaling twice
1440 by the inner loop factor. */
1441 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1442 factor, kind, stmt_info, 0, vect_prologue);
1446 /* Now accumulate cost. */
1447 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1448 add_stmt_costs (loop_vinfo->scalar_costs,
1449 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1450 loop_vinfo->scalar_costs->finish_cost (nullptr);
1454 /* Function vect_analyze_loop_form.
1456 Verify that certain CFG restrictions hold, including:
1457 - the loop has a pre-header
1458 - the loop has a single entry and exit
1459 - the loop exit condition is simple enough
1460 - the number of iterations can be analyzed, i.e., a countable loop. The
1461 niter could be analyzed under some assumptions. */
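/* Illustrative examples (not taken from the testsuite): a countable
   inner-most loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i];

   satisfies these restrictions, whereas something like

     while (p)
       p = p->next;

   is rejected because the number of iterations cannot be computed.  */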
1463 opt_result
1464 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1466 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1468 /* Different restrictions apply when we are considering an inner-most loop,
1469 vs. an outer (nested) loop.
1470 (FORNOW. May want to relax some of these restrictions in the future). */
1472 info->inner_loop_cond = NULL;
1473 if (!loop->inner)
1475 /* Inner-most loop. We currently require that the number of BBs is
1476 exactly 2 (the header and latch). Vectorizable inner-most loops
1477 look like this:
1479 (pre-header)
1481 header <--------+
1482 | | |
1483 | +--> latch --+
1485 (exit-bb) */
1487 if (loop->num_nodes != 2)
1488 return opt_result::failure_at (vect_location,
1489 "not vectorized:"
1490 " control flow in loop.\n");
1492 if (empty_block_p (loop->header))
1493 return opt_result::failure_at (vect_location,
1494 "not vectorized: empty loop.\n");
1496 else
1498 class loop *innerloop = loop->inner;
1499 edge entryedge;
1501 /* Nested loop. We currently require that the loop is doubly-nested,
1502 contains a single inner loop, and the number of BBs is exactly 5.
1503 Vectorizable outer-loops look like this:
1505 (pre-header)
1507 header <---+
1509 inner-loop |
1511 tail ------+
1513 (exit-bb)
1515 The inner-loop has the properties expected of inner-most loops
1516 as described above. */
1518 if ((loop->inner)->inner || (loop->inner)->next)
1519 return opt_result::failure_at (vect_location,
1520 "not vectorized:"
1521 " multiple nested loops.\n");
1523 if (loop->num_nodes != 5)
1524 return opt_result::failure_at (vect_location,
1525 "not vectorized:"
1526 " control flow in loop.\n");
1528 entryedge = loop_preheader_edge (innerloop);
1529 if (entryedge->src != loop->header
1530 || !single_exit (innerloop)
1531 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1532 return opt_result::failure_at (vect_location,
1533 "not vectorized:"
1534 " unsupported outerloop form.\n");
1536 /* Analyze the inner-loop. */
1537 vect_loop_form_info inner;
1538 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1539 if (!res)
1541 if (dump_enabled_p ())
1542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1543 "not vectorized: Bad inner loop.\n");
1544 return res;
1547 /* Don't support analyzing niter under assumptions for inner
1548 loop. */
1549 if (!integer_onep (inner.assumptions))
1550 return opt_result::failure_at (vect_location,
1551 "not vectorized: Bad inner loop.\n");
1553 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1554 return opt_result::failure_at (vect_location,
1555 "not vectorized: inner-loop count not"
1556 " invariant.\n");
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_NOTE, vect_location,
1560 "Considering outer-loop vectorization.\n");
1561 info->inner_loop_cond = inner.loop_cond;
1564 if (!single_exit (loop))
1565 return opt_result::failure_at (vect_location,
1566 "not vectorized: multiple exits.\n");
1567 if (EDGE_COUNT (loop->header->preds) != 2)
1568 return opt_result::failure_at (vect_location,
1569 "not vectorized:"
1570 " too many incoming edges.\n");
1572 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1573 that the loop is represented as a do-while (with a proper if-guard
1574 before the loop if needed), where the loop header contains all the
1575 executable statements, and the latch is empty. */
1576 if (!empty_block_p (loop->latch)
1577 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1578 return opt_result::failure_at (vect_location,
1579 "not vectorized: latch block not empty.\n");
1581 /* Make sure the exit is not abnormal. */
1582 edge e = single_exit (loop);
1583 if (e->flags & EDGE_ABNORMAL)
1584 return opt_result::failure_at (vect_location,
1585 "not vectorized:"
1586 " abnormal loop exit edge.\n");
1588 info->loop_cond
1589 = vect_get_loop_niters (loop, &info->assumptions,
1590 &info->number_of_iterations,
1591 &info->number_of_iterationsm1);
1592 if (!info->loop_cond)
1593 return opt_result::failure_at
1594 (vect_location,
1595 "not vectorized: complicated exit condition.\n");
1597 if (integer_zerop (info->assumptions)
1598 || !info->number_of_iterations
1599 || chrec_contains_undetermined (info->number_of_iterations))
1600 return opt_result::failure_at
1601 (info->loop_cond,
1602 "not vectorized: number of iterations cannot be computed.\n");
1604 if (integer_zerop (info->number_of_iterations))
1605 return opt_result::failure_at
1606 (info->loop_cond,
1607 "not vectorized: number of iterations = 0.\n");
1609 if (!(tree_fits_shwi_p (info->number_of_iterations)
1610 && tree_to_shwi (info->number_of_iterations) > 0))
1612 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_NOTE, vect_location,
1615 "Symbolic number of iterations is ");
1616 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1617 dump_printf (MSG_NOTE, "\n");
1621 return opt_result::success ();
1624 /* Create a loop_vec_info for LOOP with SHARED and the
1625 vect_analyze_loop_form result. */
1627 loop_vec_info
1628 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1629 const vect_loop_form_info *info,
1630 loop_vec_info main_loop_info)
1632 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1633 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1634 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1635 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1636 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1637 /* Also record the assumptions for versioning. */
1638 if (!integer_onep (info->assumptions) && !main_loop_info)
1639 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1641 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1642 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1643 if (info->inner_loop_cond)
1645 stmt_vec_info inner_loop_cond_info
1646 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1647 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1648 /* If we have an estimate on the number of iterations of the inner
1649 loop use that to limit the scale for costing, otherwise use
1650 --param vect-inner-loop-cost-factor literally. */
1651 widest_int nit;
1652 if (estimated_stmt_executions (loop->inner, &nit))
1653 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1654 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1657 return loop_vinfo;
1662 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1663 statements, update the vectorization factor. */
1665 static void
1666 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1668 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1669 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1670 int nbbs = loop->num_nodes;
1671 poly_uint64 vectorization_factor;
1672 int i;
1674 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1676 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1677 gcc_assert (known_ne (vectorization_factor, 0U));
1679 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1680 vectorization factor of the loop is the unrolling factor required by
1681 the SLP instances. If that unrolling factor is 1, we say that we
1682 perform pure SLP on the loop - cross-iteration parallelism is not
1683 exploited. */
1684 bool only_slp_in_loop = true;
1685 for (i = 0; i < nbbs; i++)
1687 basic_block bb = bbs[i];
1688 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1689 gsi_next (&si))
1691 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1692 if (!stmt_info)
1693 continue;
1694 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1695 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1696 && !PURE_SLP_STMT (stmt_info))
1697 /* STMT needs both SLP and loop-based vectorization. */
1698 only_slp_in_loop = false;
1700 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1701 gsi_next (&si))
1703 if (is_gimple_debug (gsi_stmt (si)))
1704 continue;
1705 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1706 stmt_info = vect_stmt_to_vectorize (stmt_info);
1707 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1708 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1709 && !PURE_SLP_STMT (stmt_info))
1710 /* STMT needs both SLP and loop-based vectorization. */
1711 only_slp_in_loop = false;
1715 if (only_slp_in_loop)
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_NOTE, vect_location,
1719 "Loop contains only SLP stmts\n");
1720 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1722 else
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "Loop contains SLP and non-SLP stmts\n");
1727 /* Both the vectorization factor and unroll factor have the form
1728 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1729 so they must have a common multiple. */
1730 vectorization_factor
1731 = force_common_multiple (vectorization_factor,
1732 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1735 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1736 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "Updating vectorization factor to ");
1740 dump_dec (MSG_NOTE, vectorization_factor);
1741 dump_printf (MSG_NOTE, ".\n");
1745 /* Return true if STMT_INFO describes a double reduction phi and if
1746 the other phi in the reduction is also relevant for vectorization.
1747 This rejects cases such as:
1749 outer1:
1750 x_1 = PHI <x_3(outer2), ...>;
1753 inner:
1754 x_2 = ...;
1757 outer2:
1758 x_3 = PHI <x_2(inner)>;
1760 if nothing in x_2 or elsewhere makes x_1 relevant. */
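/* Editorial example (an illustrative sketch, not from the original source):
   a double reduction of this shape typically comes from a nested
   accumulation such as

     int sum = 0;
     for (int i = 0; i < n; i++)       <-- outer loop (outer1/outer2)
       for (int j = 0; j < m; j++)     <-- inner loop
         sum += a[i][j];

   where x_1 is the outer-loop phi for SUM and x_3 is the loop-closed phi
   receiving the inner loop's result.  */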
1762 static bool
1763 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1765 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1766 return false;
1768 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1771 /* Function vect_analyze_loop_operations.
1773 Scan the loop stmts and make sure they are all vectorizable. */
1775 static opt_result
1776 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1778 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1779 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1780 int nbbs = loop->num_nodes;
1781 int i;
1782 stmt_vec_info stmt_info;
1783 bool need_to_vectorize = false;
1784 bool ok;
1786 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1788 auto_vec<stmt_info_for_cost> cost_vec;
1790 for (i = 0; i < nbbs; i++)
1792 basic_block bb = bbs[i];
1794 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1795 gsi_next (&si))
1797 gphi *phi = si.phi ();
1798 ok = true;
1800 stmt_info = loop_vinfo->lookup_stmt (phi);
1801 if (dump_enabled_p ())
1802 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1803 (gimple *) phi);
1804 if (virtual_operand_p (gimple_phi_result (phi)))
1805 continue;
1807 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1808 (i.e., a phi in the tail of the outer-loop). */
1809 if (! is_loop_header_bb_p (bb))
1811 /* FORNOW: we currently don't support the case that these phis
1812 are not used in the outer loop (unless it is a double reduction,
1813 i.e., this phi is a vect_reduction_def), because this case
1814 would require actually doing something here. */
1815 if (STMT_VINFO_LIVE_P (stmt_info)
1816 && !vect_active_double_reduction_p (stmt_info))
1817 return opt_result::failure_at (phi,
1818 "Unsupported loop-closed phi"
1819 " in outer-loop.\n");
1821 /* If PHI is used in the outer loop, we check that its operand
1822 is defined in the inner loop. */
1823 if (STMT_VINFO_RELEVANT_P (stmt_info))
1825 tree phi_op;
1827 if (gimple_phi_num_args (phi) != 1)
1828 return opt_result::failure_at (phi, "unsupported phi");
1830 phi_op = PHI_ARG_DEF (phi, 0);
1831 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1832 if (!op_def_info)
1833 return opt_result::failure_at (phi, "unsupported phi\n");
1835 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1836 && (STMT_VINFO_RELEVANT (op_def_info)
1837 != vect_used_in_outer_by_reduction))
1838 return opt_result::failure_at (phi, "unsupported phi\n");
1840 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1841 || (STMT_VINFO_DEF_TYPE (stmt_info)
1842 == vect_double_reduction_def))
1843 && !vectorizable_lc_phi (loop_vinfo,
1844 stmt_info, NULL, NULL))
1845 return opt_result::failure_at (phi, "unsupported phi\n");
1848 continue;
1851 gcc_assert (stmt_info);
1853 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1854 || STMT_VINFO_LIVE_P (stmt_info))
1855 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
1856 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
1857 /* A scalar-dependence cycle that we don't support. */
1858 return opt_result::failure_at (phi,
1859 "not vectorized:"
1860 " scalar dependence cycle.\n");
1862 if (STMT_VINFO_RELEVANT_P (stmt_info))
1864 need_to_vectorize = true;
1865 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1866 && ! PURE_SLP_STMT (stmt_info))
1867 ok = vectorizable_induction (loop_vinfo,
1868 stmt_info, NULL, NULL,
1869 &cost_vec);
1870 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1871 || (STMT_VINFO_DEF_TYPE (stmt_info)
1872 == vect_double_reduction_def)
1873 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1874 && ! PURE_SLP_STMT (stmt_info))
1875 ok = vectorizable_reduction (loop_vinfo,
1876 stmt_info, NULL, NULL, &cost_vec);
1877 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
1878 == vect_first_order_recurrence)
1879 && ! PURE_SLP_STMT (stmt_info))
1880 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
1881 &cost_vec);
1884 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1885 if (ok
1886 && STMT_VINFO_LIVE_P (stmt_info)
1887 && !PURE_SLP_STMT (stmt_info))
1888 ok = vectorizable_live_operation (loop_vinfo,
1889 stmt_info, NULL, NULL, NULL,
1890 -1, false, &cost_vec);
1892 if (!ok)
1893 return opt_result::failure_at (phi,
1894 "not vectorized: relevant phi not "
1895 "supported: %G",
1896 static_cast <gimple *> (phi));
1899 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1900 gsi_next (&si))
1902 gimple *stmt = gsi_stmt (si);
1903 if (!gimple_clobber_p (stmt)
1904 && !is_gimple_debug (stmt))
1906 opt_result res
1907 = vect_analyze_stmt (loop_vinfo,
1908 loop_vinfo->lookup_stmt (stmt),
1909 &need_to_vectorize,
1910 NULL, NULL, &cost_vec);
1911 if (!res)
1912 return res;
1915 } /* bbs */
1917 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1919 /* All operations in the loop are either irrelevant (they deal with loop
1920 control, or are dead), or are only used outside the loop and can be moved
1921 out of the loop (e.g. invariants, inductions). The loop can be
1922 optimized away by scalar optimizations. We're better off not
1923 touching this loop. */
1924 if (!need_to_vectorize)
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_NOTE, vect_location,
1928 "All the computation can be taken out of the loop.\n");
1929 return opt_result::failure_at
1930 (vect_location,
1931 "not vectorized: redundant loop. no profit to vectorize.\n");
1934 return opt_result::success ();
1937 /* Return true if we know that the iteration count is smaller than the
1938 vectorization factor. Return false if it isn't, or if we can't be sure
1939 either way. */
1941 static bool
1942 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1944 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1946 HOST_WIDE_INT max_niter;
1947 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1948 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1949 else
1950 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1952 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1953 return true;
1955 return false;
1958 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1959 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1960 definitely no, or -1 if it's worth retrying. */
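/* Editorial note (a paraphrase, not part of the original source): the
   tri-state result is consumed by vect_analyze_loop_2 below roughly as

     res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
     if (res < 0)
       goto again;      <-- worth retrying, e.g. with SLP forced off
     if (!res)
       return opt_result::failure_at (...);   <-- definitely not worthwhile
     ... res == 1: costing says vectorization is worthwhile ...  */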
1962 static int
1963 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1964 unsigned *suggested_unroll_factor)
1966 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1967 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1969 /* Only loops that can handle partially-populated vectors can have iteration
1970 counts less than the vectorization factor. */
1971 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1973 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1977 "not vectorized: iteration count smaller than "
1978 "vectorization factor.\n");
1979 return 0;
1983 /* If using the "very cheap" model, reject cases in which we'd keep
1984 a copy of the scalar code (even if we might be able to vectorize it). */
1985 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1986 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1987 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1988 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1990 if (dump_enabled_p ())
1991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992 "some scalar iterations would need to be peeled\n");
1993 return 0;
1996 int min_profitable_iters, min_profitable_estimate;
1997 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1998 &min_profitable_estimate,
1999 suggested_unroll_factor);
2001 if (min_profitable_iters < 0)
2003 if (dump_enabled_p ())
2004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005 "not vectorized: vectorization not profitable.\n");
2006 if (dump_enabled_p ())
2007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2008 "not vectorized: vector version will never be "
2009 "profitable.\n");
2010 return -1;
2013 int min_scalar_loop_bound = (param_min_vect_loop_bound
2014 * assumed_vf);
2016 /* Use the cost model only if it is more conservative than the
2017 user-specified threshold. */
2018 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2019 min_profitable_iters);
2021 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2023 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2024 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2028 "not vectorized: vectorization not profitable.\n");
2029 if (dump_enabled_p ())
2030 dump_printf_loc (MSG_NOTE, vect_location,
2031 "not vectorized: iteration count smaller than user "
2032 "specified loop bound parameter or minimum profitable "
2033 "iterations (whichever is more conservative).\n");
2034 return 0;
2037 /* The static profitability threshold min_profitable_estimate includes
2038 the cost of having to check at runtime whether the scalar loop
2039 should be used instead. If it turns out that we don't need or want
2040 such a check, the threshold we should use for the static estimate
2041 is simply the point at which the vector loop becomes more profitable
2042 than the scalar loop. */
2043 if (min_profitable_estimate > min_profitable_iters
2044 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2045 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2046 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2047 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2049 if (dump_enabled_p ())
2050 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2051 " choice between the scalar and vector loops\n");
2052 min_profitable_estimate = min_profitable_iters;
2055 /* If the vector loop needs multiple iterations to be beneficial then
2056 things are probably too close to call, and the conservative thing
2057 would be to stick with the scalar code. */
2058 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2059 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063 "one iteration of the vector loop would be"
2064 " more expensive than the equivalent number of"
2065 " iterations of the scalar loop\n");
2066 return 0;
2069 HOST_WIDE_INT estimated_niter;
2071 /* If we are vectorizing an epilogue then we know the maximum number of
2072 scalar iterations it will cover is at least one lower than the
2073 vectorization factor of the main loop. */
2074 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2075 estimated_niter
2076 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2077 else
2079 estimated_niter = estimated_stmt_executions_int (loop);
2080 if (estimated_niter == -1)
2081 estimated_niter = likely_max_stmt_executions_int (loop);
2083 if (estimated_niter != -1
2084 && ((unsigned HOST_WIDE_INT) estimated_niter
2085 < MAX (th, (unsigned) min_profitable_estimate)))
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089 "not vectorized: estimated iteration count too "
2090 "small.\n");
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "not vectorized: estimated iteration count smaller "
2094 "than specified loop bound parameter or minimum "
2095 "profitable iterations (whichever is more "
2096 "conservative).\n");
2097 return -1;
2100 return 1;
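/* Editorial description, added for readability: gather the data references
   of all non-debug statements in LOOP, whose basic blocks are given in BBS,
   into *DATAREFS, and count those statements in *N_STMTS.  Fail if a
   reference cannot be analyzed (except for certain calls to simd-clone
   functions inside safelen loops) or if the number of datarefs exceeds
   param_loop_max_datarefs_for_datadeps.  */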
2103 static opt_result
2104 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2105 vec<data_reference_p> *datarefs,
2106 unsigned int *n_stmts)
2108 *n_stmts = 0;
2109 for (unsigned i = 0; i < loop->num_nodes; i++)
2110 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2111 !gsi_end_p (gsi); gsi_next (&gsi))
2113 gimple *stmt = gsi_stmt (gsi);
2114 if (is_gimple_debug (stmt))
2115 continue;
2116 ++(*n_stmts);
2117 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2118 NULL, 0);
2119 if (!res)
2121 if (is_gimple_call (stmt) && loop->safelen)
2123 tree fndecl = gimple_call_fndecl (stmt), op;
2124 if (fndecl != NULL_TREE)
2126 cgraph_node *node = cgraph_node::get (fndecl);
2127 if (node != NULL && node->simd_clones != NULL)
2129 unsigned int j, n = gimple_call_num_args (stmt);
2130 for (j = 0; j < n; j++)
2132 op = gimple_call_arg (stmt, j);
2133 if (DECL_P (op)
2134 || (REFERENCE_CLASS_P (op)
2135 && get_base_address (op)))
2136 break;
2138 op = gimple_call_lhs (stmt);
2139 /* Ignore #pragma omp declare simd functions
2140 if they don't have data references in the
2141 call stmt itself. */
2142 if (j == n
2143 && !(op
2144 && (DECL_P (op)
2145 || (REFERENCE_CLASS_P (op)
2146 && get_base_address (op)))))
2147 continue;
2151 return res;
2153 /* If dependence analysis will give up due to the limit on the
2154 number of datarefs, stop here and fail fatally. */
2155 if (datarefs->length ()
2156 > (unsigned)param_loop_max_datarefs_for_datadeps)
2157 return opt_result::failure_at (stmt, "exceeded param "
2158 "loop-max-datarefs-for-datadeps\n");
2160 return opt_result::success ();
2163 /* Look for SLP-only access groups and turn each individual access into its own
2164 group. */
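/* Editorial illustration (a sketch, not part of the original source):
   an interleaved group of four accesses a[4*i], a[4*i+1], a[4*i+2],
   a[4*i+3] that was built only for SLP is split so that each access
   becomes its own group leader with DR_GROUP_SIZE 1 and, for non-strided
   accesses, DR_GROUP_GAP 3 to account for the elements now skipped
   between consecutive accesses of the same dissolved member.  */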
2165 static void
2166 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2168 unsigned int i;
2169 struct data_reference *dr;
2171 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2173 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2174 FOR_EACH_VEC_ELT (datarefs, i, dr)
2176 gcc_assert (DR_REF (dr));
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2179 /* Check if the load is a part of an interleaving chain. */
2180 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2182 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2183 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2184 unsigned int group_size = DR_GROUP_SIZE (first_element);
2186 /* Check for SLP-only groups. */
2187 if (!STMT_SLP_TYPE (stmt_info)
2188 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2190 /* Dissolve the group. */
2191 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2193 stmt_vec_info vinfo = first_element;
2194 while (vinfo)
2196 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2197 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2198 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2199 DR_GROUP_SIZE (vinfo) = 1;
2200 if (STMT_VINFO_STRIDED_P (first_element))
2201 DR_GROUP_GAP (vinfo) = 0;
2202 else
2203 DR_GROUP_GAP (vinfo) = group_size - 1;
2204 /* Duplicate and adjust the alignment info; it needs to
2205 be present on each group leader, see dr_misalignment. */
2206 if (vinfo != first_element)
2208 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2209 dr_info2->target_alignment = dr_info->target_alignment;
2210 int misalignment = dr_info->misalignment;
2211 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2213 HOST_WIDE_INT diff
2214 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2215 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2216 unsigned HOST_WIDE_INT align_c
2217 = dr_info->target_alignment.to_constant ();
2218 misalignment = (misalignment + diff) % align_c;
2220 dr_info2->misalignment = misalignment;
2222 vinfo = next;
2229 /* Determine if operating on full vectors for LOOP_VINFO might leave
2230 some scalar iterations still to do. If so, decide how we should
2231 handle those scalar iterations. The possibilities are:
2233 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2234 In this case:
2236 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2237 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2238 LOOP_VINFO_PEELING_FOR_NITER == false
2240 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2241 to handle the remaining scalar iterations. In this case:
2243 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2244 LOOP_VINFO_PEELING_FOR_NITER == true
2246 There are two choices:
2248 (2a) Consider vectorizing the epilogue loop at the same VF as the
2249 main loop, but using partial vectors instead of full vectors.
2250 In this case:
2252 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2254 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2255 In this case:
2257 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2259 When FOR_EPILOGUE_P is true, make this determination based on the
2260 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2261 based on the assumption that LOOP_VINFO is the main loop. The caller
2262 has made sure that the number of iterations is set appropriately for
2263 this value of FOR_EPILOGUE_P. */
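/* Editorial illustration (a sketch, not part of the original source):
   for a loop of 10 scalar iterations and a vectorization factor of 4,
   option (1) runs 3 vector iterations whose last one is only partially
   populated (masked or length-controlled), while option (2) runs 2
   full-vector iterations and leaves 2 scalar iterations for an epilogue
   loop, which may itself be vectorized as in (2a) or (2b).  */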
2265 opt_result
2266 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2267 bool for_epilogue_p)
2269 /* Determine whether there would be any scalar iterations left over. */
2270 bool need_peeling_or_partial_vectors_p
2271 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2273 /* Decide whether to vectorize the loop with partial vectors. */
2274 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2275 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2276 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2277 && need_peeling_or_partial_vectors_p)
2279 /* For partial-vector-usage=1, try to push the handling of partial
2280 vectors to the epilogue, with the main loop continuing to operate
2281 on full vectors.
2283 If we are unrolling we also do not want to use partial vectors. This
2284 is to avoid the overhead of generating multiple masks and also to
2285 avoid having to execute entire iterations of FALSE masked instructions
2286 when dealing with one or fewer full iterations.
2288 ??? We could then end up failing to use partial vectors if we
2289 decide to peel iterations into a prologue, and if the main loop
2290 then ends up processing fewer than VF iterations. */
2291 if ((param_vect_partial_vector_usage == 1
2292 || loop_vinfo->suggested_unroll_factor > 1)
2293 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2294 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2295 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2296 else
2297 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2300 if (dump_enabled_p ())
2302 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "operating on partial vectors%s.\n",
2305 for_epilogue_p ? " for epilogue loop" : "");
2306 else
2307 dump_printf_loc (MSG_NOTE, vect_location,
2308 "operating only on full vectors%s.\n",
2309 for_epilogue_p ? " for epilogue loop" : "");
2312 if (for_epilogue_p)
2314 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2315 gcc_assert (orig_loop_vinfo);
2316 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2317 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2318 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2321 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2322 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2324 /* Check that the loop processes at least one full vector. */
2325 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2326 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2327 if (known_lt (wi::to_widest (scalar_niters), vf))
2328 return opt_result::failure_at (vect_location,
2329 "loop does not have enough iterations"
2330 " to support vectorization.\n");
2332 /* If we need to peel an extra epilogue iteration to handle data
2333 accesses with gaps, check that there are enough scalar iterations
2334 available.
2336 The check above is redundant with this one when peeling for gaps,
2337 but the distinction is useful for diagnostics. */
2338 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2339 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2340 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2341 return opt_result::failure_at (vect_location,
2342 "loop does not have enough iterations"
2343 " to support peeling for gaps.\n");
2346 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2347 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2348 && need_peeling_or_partial_vectors_p);
2350 return opt_result::success ();
2353 /* Function vect_analyze_loop_2.
2355 Apply a set of analyses on the loop specified by LOOP_VINFO; the different
2356 analyses will record information in some members of LOOP_VINFO. FATAL
2357 indicates whether some analysis hit a fatal error. If a non-NULL pointer
2358 SUGGESTED_UNROLL_FACTOR is provided, it is filled with the suggested
2359 unroll factor that was worked out, while a NULL pointer means that the
2360 previously suggested unroll factor is going to be applied.
2361 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the suggested
2362 unroll factor was worked out. */
2363 static opt_result
2364 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2365 unsigned *suggested_unroll_factor,
2366 bool& slp_done_for_suggested_uf)
2368 opt_result ok = opt_result::success ();
2369 int res;
2370 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2371 poly_uint64 min_vf = 2;
2372 loop_vec_info orig_loop_vinfo = NULL;
2374 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2375 loop_vec_info of the first vectorized loop. */
2376 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2377 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2378 else
2379 orig_loop_vinfo = loop_vinfo;
2380 gcc_assert (orig_loop_vinfo);
2382 /* The first group of checks is independent of the vector size. */
2383 fatal = true;
2385 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2386 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2387 return opt_result::failure_at (vect_location,
2388 "not vectorized: simd if(0)\n");
2390 /* Find all data references in the loop (which correspond to vdefs/vuses)
2391 and analyze their evolution in the loop. */
2393 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2395 /* Gather the data references and count stmts in the loop. */
2396 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2398 opt_result res
2399 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2400 &LOOP_VINFO_DATAREFS (loop_vinfo),
2401 &LOOP_VINFO_N_STMTS (loop_vinfo));
2402 if (!res)
2404 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2406 "not vectorized: loop contains function "
2407 "calls or data references that cannot "
2408 "be analyzed\n");
2409 return res;
2411 loop_vinfo->shared->save_datarefs ();
2413 else
2414 loop_vinfo->shared->check_datarefs ();
2416 /* Analyze the data references and also adjust the minimal
2417 vectorization factor according to the loads and stores. */
2419 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2420 if (!ok)
2422 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2424 "bad data references.\n");
2425 return ok;
2428 /* Check if we are applying unroll factor now. */
2429 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2430 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2432 /* If the SLP decision was false when the suggested unroll factor was
2433 worked out, and we are applying the suggested unroll factor, we can
2434 simply skip all SLP-related analyses this time. */
2435 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2437 /* Classify all cross-iteration scalar data-flow cycles.
2438 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2439 vect_analyze_scalar_cycles (loop_vinfo, slp);
2441 vect_pattern_recog (loop_vinfo);
2443 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2445 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2446 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2448 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2449 if (!ok)
2451 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 "bad data access.\n");
2454 return ok;
2457 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2459 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2460 if (!ok)
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2464 "unexpected pattern.\n");
2465 return ok;
2468 /* The rest of the analysis below depends on the vector size in some way. */
2469 fatal = false;
2471 /* Analyze data dependences between the data-refs in the loop
2472 and adjust the maximum vectorization factor according to
2473 the dependences.
2474 FORNOW: fail at the first data dependence that we encounter. */
2476 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2477 if (!ok)
2479 if (dump_enabled_p ())
2480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2481 "bad data dependence.\n");
2482 return ok;
2484 if (max_vf != MAX_VECTORIZATION_FACTOR
2485 && maybe_lt (max_vf, min_vf))
2486 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2487 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2489 ok = vect_determine_vectorization_factor (loop_vinfo);
2490 if (!ok)
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "can't determine vectorization factor.\n");
2495 return ok;
2497 if (max_vf != MAX_VECTORIZATION_FACTOR
2498 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2499 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2501 /* Compute the scalar iteration cost. */
2502 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2504 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2506 if (slp)
2508 /* Check the SLP opportunities in the loop, analyze and build
2509 SLP trees. */
2510 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2511 if (!ok)
2512 return ok;
2514 /* If there are any SLP instances mark them as pure_slp. */
2515 slp = vect_make_slp_decision (loop_vinfo);
2516 if (slp)
2518 /* Find stmts that need to be both vectorized and SLPed. */
2519 vect_detect_hybrid_slp (loop_vinfo);
2521 /* Update the vectorization factor based on the SLP decision. */
2522 vect_update_vf_for_slp (loop_vinfo);
2524 /* Optimize the SLP graph with the vectorization factor fixed. */
2525 vect_optimize_slp (loop_vinfo);
2527 /* Gather the loads reachable from the SLP graph entries. */
2528 vect_gather_slp_loads (loop_vinfo);
2532 bool saved_can_use_partial_vectors_p
2533 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2535 /* We don't expect to have to roll back to anything other than an empty
2536 set of rgroups. */
2537 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2539 /* This is the point where we can re-start analysis with SLP forced off. */
2540 start_over:
2542 /* Apply the suggested unrolling factor; this was determined by the backend
2543 during finish_cost the first time we ran the analysis for this
2544 vector mode. */
2545 if (applying_suggested_uf)
2546 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2548 /* Now the vectorization factor is final. */
2549 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2550 gcc_assert (known_ne (vectorization_factor, 0U));
2552 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2554 dump_printf_loc (MSG_NOTE, vect_location,
2555 "vectorization_factor = ");
2556 dump_dec (MSG_NOTE, vectorization_factor);
2557 dump_printf (MSG_NOTE, ", niters = %wd\n",
2558 LOOP_VINFO_INT_NITERS (loop_vinfo));
2561 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2563 /* Analyze the alignment of the data-refs in the loop.
2564 Fail if a data reference is found that cannot be vectorized. */
2566 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2567 if (!ok)
2569 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2571 "bad data alignment.\n");
2572 return ok;
2575 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2576 It is important to call pruning after vect_analyze_data_ref_accesses,
2577 since we use grouping information gathered by interleaving analysis. */
2578 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2579 if (!ok)
2580 return ok;
2582 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2583 vectorization, since we do not want to add extra peeling or
2584 add versioning for alignment. */
2585 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2586 /* This pass will decide on using loop versioning and/or loop peeling in
2587 order to enhance the alignment of data references in the loop. */
2588 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2589 if (!ok)
2590 return ok;
2592 if (slp)
2594 /* Analyze operations in the SLP instances. Note this may
2595 remove unsupported SLP instances which makes the above
2596 SLP kind detection invalid. */
2597 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2598 vect_slp_analyze_operations (loop_vinfo);
2599 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2601 ok = opt_result::failure_at (vect_location,
2602 "unsupported SLP instances\n");
2603 goto again;
2606 /* Check whether any load in ALL SLP instances is possibly permuted. */
2607 slp_tree load_node, slp_root;
2608 unsigned i, x;
2609 slp_instance instance;
2610 bool can_use_lanes = true;
2611 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2613 slp_root = SLP_INSTANCE_TREE (instance);
2614 int group_size = SLP_TREE_LANES (slp_root);
2615 tree vectype = SLP_TREE_VECTYPE (slp_root);
2616 bool loads_permuted = false;
2617 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2619 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2620 continue;
2621 unsigned j;
2622 stmt_vec_info load_info;
2623 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2624 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2626 loads_permuted = true;
2627 break;
2631 /* If the loads and stores can be handled with load/store-lane
2632 instructions record it and move on to the next instance. */
2633 if (loads_permuted
2634 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2635 && vect_store_lanes_supported (vectype, group_size, false))
2637 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2639 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2640 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2641 /* Use SLP for strided accesses (or if we can't
2642 load-lanes). */
2643 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2644 || ! vect_load_lanes_supported
2645 (STMT_VINFO_VECTYPE (stmt_vinfo),
2646 DR_GROUP_SIZE (stmt_vinfo), false))
2647 break;
2650 can_use_lanes
2651 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2653 if (can_use_lanes && dump_enabled_p ())
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "SLP instance %p can use load/store-lanes\n",
2656 (void *) instance);
2658 else
2660 can_use_lanes = false;
2661 break;
2665 /* If all SLP instances can use load/store-lanes abort SLP and try again
2666 with SLP disabled. */
2667 if (can_use_lanes)
2669 ok = opt_result::failure_at (vect_location,
2670 "Built SLP cancelled: can use "
2671 "load/store-lanes\n");
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2674 "Built SLP cancelled: all SLP instances support "
2675 "load/store-lanes\n");
2676 goto again;
2680 /* Dissolve SLP-only groups. */
2681 vect_dissolve_slp_only_groups (loop_vinfo);
2683 /* Scan all the remaining operations in the loop that are not subject
2684 to SLP and make sure they are vectorizable. */
2685 ok = vect_analyze_loop_operations (loop_vinfo);
2686 if (!ok)
2688 if (dump_enabled_p ())
2689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2690 "bad operation or unsupported loop bound.\n");
2691 return ok;
2694 /* For now, we don't expect to mix both masking and length approaches for one
2695 loop; disable the use of partial vectors if both are recorded. */
2696 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2697 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2698 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2702 "can't vectorize a loop with partial vectors"
2703 " because we don't expect to mix different"
2704 " approaches with partial vectors for the"
2705 " same loop.\n");
2706 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2709 /* If we still have the option of using partial vectors,
2710 check whether we can generate the necessary loop controls. */
2711 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2712 && !vect_verify_full_masking (loop_vinfo)
2713 && !vect_verify_loop_lens (loop_vinfo))
2714 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2716 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2717 to be able to handle fewer than VF scalars, or needs to have a lower VF
2718 than the main loop. */
2719 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2720 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2721 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2722 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2723 return opt_result::failure_at (vect_location,
2724 "Vectorization factor too high for"
2725 " epilogue loop.\n");
2727 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2728 assuming that the loop will be used as a main loop. We will redo
2729 this analysis later if we instead decide to use the loop as an
2730 epilogue loop. */
2731 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2732 if (!ok)
2733 return ok;
2735 /* Check the costings of the loop make vectorizing worthwhile. */
2736 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2737 if (res < 0)
2739 ok = opt_result::failure_at (vect_location,
2740 "Loop costings may not be worthwhile.\n");
2741 goto again;
2743 if (!res)
2744 return opt_result::failure_at (vect_location,
2745 "Loop costings not worthwhile.\n");
2747 /* If an epilogue loop is required make sure we can create one. */
2748 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2749 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2751 if (dump_enabled_p ())
2752 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2753 if (!vect_can_advance_ivs_p (loop_vinfo)
2754 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2755 single_exit (LOOP_VINFO_LOOP
2756 (loop_vinfo))))
2758 ok = opt_result::failure_at (vect_location,
2759 "not vectorized: can't create required "
2760 "epilog loop\n");
2761 goto again;
2765 /* During peeling, we need to check whether the number of loop iterations is
2766 enough for both the peeled prolog loop and the vector loop. This check
2767 can be merged with the threshold check of loop versioning, so
2768 increase the threshold for this case if necessary.
2770 If we are analyzing an epilogue we still want to check what its
2771 versioning threshold would be. If we decide to vectorize the epilogues we
2772 will want to use the lowest versioning threshold of all epilogues and main
2773 loop. This will enable us to enter a vectorized epilogue even when
2774 versioning the loop. We can't simply check whether the epilogue requires
2775 versioning though since we may have skipped some versioning checks when
2776 analyzing the epilogue. For instance, checks for alias versioning will be
2777 skipped when dealing with epilogues as we assume we already checked them
2778 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2779 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2781 poly_uint64 niters_th = 0;
2782 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2784 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2786 /* Niters for peeled prolog loop. */
2787 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2789 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2790 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2791 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2793 else
2794 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2797 /* Niters for at least one iteration of vectorized loop. */
2798 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2799 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2800 /* One additional iteration because of peeling for gap. */
2801 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2802 niters_th += 1;
2804 /* Use the same condition as vect_transform_loop to decide when to use
2805 the cost to determine a versioning threshold. */
2806 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2807 && ordered_p (th, niters_th))
2808 niters_th = ordered_max (poly_uint64 (th), niters_th);
2810 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2813 gcc_assert (known_eq (vectorization_factor,
2814 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2816 slp_done_for_suggested_uf = slp;
2818 /* Ok to vectorize! */
2819 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2820 return opt_result::success ();
2822 again:
2823 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2824 gcc_assert (!ok);
2826 /* Try again with SLP forced off but if we didn't do any SLP there is
2827 no point in re-trying. */
2828 if (!slp)
2829 return ok;
2831 /* If the SLP decision was true when the suggested unroll factor was
2832 worked out, and we are applying the suggested unroll factor, we don't
2833 need to re-try any more. */
2834 if (applying_suggested_uf && slp_done_for_suggested_uf)
2835 return ok;
2837 /* If there are reduction chains re-trying will fail anyway. */
2838 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2839 return ok;
2841 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2842 via interleaving or lane instructions. */
2843 slp_instance instance;
2844 slp_tree node;
2845 unsigned i, j;
2846 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2848 stmt_vec_info vinfo;
2849 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2850 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2851 continue;
2852 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2853 unsigned int size = DR_GROUP_SIZE (vinfo);
2854 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2855 if (! vect_store_lanes_supported (vectype, size, false)
2856 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2857 && ! vect_grouped_store_supported (vectype, size))
2858 return opt_result::failure_at (vinfo->stmt,
2859 "unsupported grouped store\n");
2860 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2862 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2863 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2864 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2865 size = DR_GROUP_SIZE (vinfo);
2866 vectype = STMT_VINFO_VECTYPE (vinfo);
2867 if (! vect_load_lanes_supported (vectype, size, false)
2868 && ! vect_grouped_load_supported (vectype, single_element_p,
2869 size))
2870 return opt_result::failure_at (vinfo->stmt,
2871 "unsupported grouped load\n");
2875 if (dump_enabled_p ())
2876 dump_printf_loc (MSG_NOTE, vect_location,
2877 "re-trying with SLP disabled\n");
2879 /* Roll back state appropriately. No SLP this time. */
2880 slp = false;
2881 /* Restore vectorization factor as it were without SLP. */
2882 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2883 /* Free the SLP instances. */
2884 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2885 vect_free_slp_instance (instance);
2886 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2887 /* Reset SLP type to loop_vect on all stmts. */
2888 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2890 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2891 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2892 !gsi_end_p (si); gsi_next (&si))
2894 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2895 STMT_SLP_TYPE (stmt_info) = loop_vect;
2896 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2897 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2899 /* vectorizable_reduction adjusts reduction stmt def-types,
2900 restore them to that of the PHI. */
2901 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2902 = STMT_VINFO_DEF_TYPE (stmt_info);
2903 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2904 (STMT_VINFO_REDUC_DEF (stmt_info)))
2905 = STMT_VINFO_DEF_TYPE (stmt_info);
2908 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2909 !gsi_end_p (si); gsi_next (&si))
2911 if (is_gimple_debug (gsi_stmt (si)))
2912 continue;
2913 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2914 STMT_SLP_TYPE (stmt_info) = loop_vect;
2915 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2917 stmt_vec_info pattern_stmt_info
2918 = STMT_VINFO_RELATED_STMT (stmt_info);
2919 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2920 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2922 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2923 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2924 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2925 !gsi_end_p (pi); gsi_next (&pi))
2926 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2927 = loop_vect;
2931 /* Free optimized alias test DDRS. */
2932 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2933 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2934 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2935 /* Reset target cost data. */
2936 delete loop_vinfo->vector_costs;
2937 loop_vinfo->vector_costs = nullptr;
2938 /* Reset accumulated rgroup information. */
2939 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2940 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2941 /* Reset assorted flags. */
2942 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2943 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2944 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2945 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2946 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2947 = saved_can_use_partial_vectors_p;
2949 goto start_over;
2952 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2953 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2954 OLD_LOOP_VINFO is better unless something specifically indicates
2955 otherwise.
2957 Note that this deliberately isn't a partial order. */
2959 static bool
2960 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2961 loop_vec_info old_loop_vinfo)
2963 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2964 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2966 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2967 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2969 /* Always prefer a VF of loop->simdlen over any other VF. */
2970 if (loop->simdlen)
2972 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2973 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2974 if (new_simdlen_p != old_simdlen_p)
2975 return new_simdlen_p;
2978 const auto *old_costs = old_loop_vinfo->vector_costs;
2979 const auto *new_costs = new_loop_vinfo->vector_costs;
2980 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2981 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2983 return new_costs->better_main_loop_than_p (old_costs);
2986 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2987 true if we should. */
2989 static bool
2990 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2991 loop_vec_info old_loop_vinfo)
2993 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2994 return false;
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_NOTE, vect_location,
2998 "***** Preferring vector mode %s to vector mode %s\n",
2999 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3000 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3001 return true;
3004 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3005 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3006 MODE_I to the next mode useful to analyze.
3007 Return the loop_vinfo on success and wrapped null on failure. */
3009 static opt_loop_vec_info
3010 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3011 const vect_loop_form_info *loop_form_info,
3012 loop_vec_info main_loop_vinfo,
3013 const vector_modes &vector_modes, unsigned &mode_i,
3014 machine_mode &autodetected_vector_mode,
3015 bool &fatal)
3017 loop_vec_info loop_vinfo
3018 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3020 machine_mode vector_mode = vector_modes[mode_i];
3021 loop_vinfo->vector_mode = vector_mode;
3022 unsigned int suggested_unroll_factor = 1;
3023 bool slp_done_for_suggested_uf;
3025 /* Run the main analysis. */
3026 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3027 &suggested_unroll_factor,
3028 slp_done_for_suggested_uf);
3029 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_NOTE, vect_location,
3031 "***** Analysis %s with vector mode %s\n",
3032 res ? "succeeded" : " failed",
3033 GET_MODE_NAME (loop_vinfo->vector_mode));
3035 if (!main_loop_vinfo && suggested_unroll_factor > 1)
3037 if (dump_enabled_p ())
3038 dump_printf_loc (MSG_NOTE, vect_location,
3039 "***** Re-trying analysis for unrolling"
3040 " with unroll factor %d and slp %s.\n",
3041 suggested_unroll_factor,
3042 slp_done_for_suggested_uf ? "on" : "off");
3043 loop_vec_info unroll_vinfo
3044 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3045 unroll_vinfo->vector_mode = vector_mode;
3046 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3047 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3048 slp_done_for_suggested_uf);
3049 if (new_res)
3051 delete loop_vinfo;
3052 loop_vinfo = unroll_vinfo;
3054 else
3055 delete unroll_vinfo;
3058 /* Remember the autodetected vector mode. */
3059 if (vector_mode == VOIDmode)
3060 autodetected_vector_mode = loop_vinfo->vector_mode;
3062 /* Advance mode_i, first skipping modes that would result in the
3063 same analysis result. */
3064 while (mode_i + 1 < vector_modes.length ()
3065 && vect_chooses_same_modes_p (loop_vinfo,
3066 vector_modes[mode_i + 1]))
3068 if (dump_enabled_p ())
3069 dump_printf_loc (MSG_NOTE, vect_location,
3070 "***** The result for vector mode %s would"
3071 " be the same\n",
3072 GET_MODE_NAME (vector_modes[mode_i + 1]));
3073 mode_i += 1;
3075 if (mode_i + 1 < vector_modes.length ()
3076 && VECTOR_MODE_P (autodetected_vector_mode)
3077 && (related_vector_mode (vector_modes[mode_i + 1],
3078 GET_MODE_INNER (autodetected_vector_mode))
3079 == autodetected_vector_mode)
3080 && (related_vector_mode (autodetected_vector_mode,
3081 GET_MODE_INNER (vector_modes[mode_i + 1]))
3082 == vector_modes[mode_i + 1]))
3084 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE, vect_location,
3086 "***** Skipping vector mode %s, which would"
3087 " repeat the analysis for %s\n",
3088 GET_MODE_NAME (vector_modes[mode_i + 1]),
3089 GET_MODE_NAME (autodetected_vector_mode));
3090 mode_i += 1;
3092 mode_i++;
3094 if (!res)
3096 delete loop_vinfo;
3097 if (fatal)
3098 gcc_checking_assert (main_loop_vinfo == NULL);
3099 return opt_loop_vec_info::propagate_failure (res);
3102 return opt_loop_vec_info::success (loop_vinfo);
3105 /* Function vect_analyze_loop.
3107 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3108 for it. The different analyses will record information in the
3109 loop_vec_info struct. */
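/* Editorial overview of the control flow that follows (a paraphrase,
   not part of the original source):

     check the loop nest and analyze the loop form;
     for each candidate vector mode, starting with the autodetected one:
       analyze the loop as a main loop via vect_analyze_loop_1,
       keeping the first success, or the cheapest one so far when
       VECT_COMPARE_COSTS is enabled, and preferring a mode that
       matches loop->simdlen when one was requested;
     if epilogue vectorization is enabled:
       re-run the mode loop with the chosen main loop as context,
       recording at most one epilogue loop_vec_info;
     return the chosen main loop_vec_info.  */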
3110 opt_loop_vec_info
3111 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3113 DUMP_VECT_SCOPE ("analyze_loop_nest");
3115 if (loop_outer (loop)
3116 && loop_vec_info_for_loop (loop_outer (loop))
3117 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3118 return opt_loop_vec_info::failure_at (vect_location,
3119 "outer-loop already vectorized.\n");
3121 if (!find_loop_nest (loop, &shared->loop_nest))
3122 return opt_loop_vec_info::failure_at
3123 (vect_location,
3124 "not vectorized: loop nest containing two or more consecutive inner"
3125 " loops cannot be vectorized\n");
3127 /* Analyze the loop form. */
3128 vect_loop_form_info loop_form_info;
3129 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3130 if (!res)
3132 if (dump_enabled_p ())
3133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3134 "bad loop form.\n");
3135 return opt_loop_vec_info::propagate_failure (res);
3137 if (!integer_onep (loop_form_info.assumptions))
3139 /* We consider vectorizing this loop by versioning it under
3140 some assumptions. In order to do this, we need to clear the
3141 existing information computed by the scev and niter analyzers. */
3142 scev_reset_htab ();
3143 free_numbers_of_iterations_estimates (loop);
3144 /* Also set flag for this loop so that following scev and niter
3145 analysis are done under the assumptions. */
3146 loop_constraint_set (loop, LOOP_C_FINITE);
3149 auto_vector_modes vector_modes;
3150 /* Autodetect first vector size we try. */
3151 vector_modes.safe_push (VOIDmode);
3152 unsigned int autovec_flags
3153 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3154 loop->simdlen != 0);
3155 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3156 && !unlimited_cost_model (loop));
3157 machine_mode autodetected_vector_mode = VOIDmode;
3158 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3159 unsigned int mode_i = 0;
3160 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3162 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3163 a mode has not been analyzed. */
3164 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3165 for (unsigned i = 0; i < vector_modes.length (); ++i)
3166 cached_vf_per_mode.safe_push (0);
3168 /* First determine the main loop vectorization mode, either the first
3169 one that works, starting with auto-detecting the vector mode and then
3170 following the target's order of preference, or the one with the
3171 lowest cost if pick_lowest_cost_p. */
3172 while (1)
3174 bool fatal;
3175 unsigned int last_mode_i = mode_i;
3176 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3177 failed. */
3178 cached_vf_per_mode[last_mode_i] = -1;
3179 opt_loop_vec_info loop_vinfo
3180 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3181 NULL, vector_modes, mode_i,
3182 autodetected_vector_mode, fatal);
3183 if (fatal)
3184 break;
3186 if (loop_vinfo)
3188 /* Analysis has been successful, so update the VF value. The
3189 VF should always be a multiple of unroll_factor and we want to
3190 capture the original VF here. */
3191 cached_vf_per_mode[last_mode_i]
3192 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3193 loop_vinfo->suggested_unroll_factor);
3194 /* Once we hit the desired simdlen for the first time,
3195 discard any previous attempts. */
3196 if (simdlen
3197 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3199 delete first_loop_vinfo;
3200 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3201 simdlen = 0;
3203 else if (pick_lowest_cost_p
3204 && first_loop_vinfo
3205 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3207 /* Pick loop_vinfo over first_loop_vinfo. */
3208 delete first_loop_vinfo;
3209 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3211 if (first_loop_vinfo == NULL)
3212 first_loop_vinfo = loop_vinfo;
3213 else
3215 delete loop_vinfo;
3216 loop_vinfo = opt_loop_vec_info::success (NULL);
3219 /* Commit to first_loop_vinfo if we have no reason to try
3220 alternatives. */
3221 if (!simdlen && !pick_lowest_cost_p)
3222 break;
3224 if (mode_i == vector_modes.length ()
3225 || autodetected_vector_mode == VOIDmode)
3226 break;
3228 /* Try the next biggest vector size. */
3229 if (dump_enabled_p ())
3230 dump_printf_loc (MSG_NOTE, vect_location,
3231 "***** Re-trying analysis with vector mode %s\n",
3232 GET_MODE_NAME (vector_modes[mode_i]));
3234 if (!first_loop_vinfo)
3235 return opt_loop_vec_info::propagate_failure (res);
3237 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_NOTE, vect_location,
3239 "***** Choosing vector mode %s\n",
3240 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3242 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3243 enabled, SIMDUID is not set, it is the innermost loop and we have
3244 either already found the loop's SIMDLEN or there was no SIMDLEN to
3245 begin with.
3246 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3247 bool vect_epilogues = (!simdlen
3248 && loop->inner == NULL
3249 && param_vect_epilogues_nomask
3250 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3251 && !loop->simduid);
3252 if (!vect_epilogues)
3253 return first_loop_vinfo;
3255 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3256 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3258 /* For epilogues start the analysis from the first mode. The motivation
3259 behind starting from the beginning comes from cases where the VECTOR_MODES
3260 array may contain length-agnostic and length-specific modes. Their
3261 ordering is not guaranteed, so we could end up picking a mode for the main
3262 loop that is after the epilogue's optimal mode. */
3263 vector_modes[0] = autodetected_vector_mode;
3264 mode_i = 0;
3266 bool supports_partial_vectors =
3267 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3268 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3270 while (1)
3272 /* If the target does not support partial vectors we can shorten the
3273 number of modes to analyze for the epilogue as we know we can't pick a
3274 mode that would lead to a VF at least as big as the
3275 FIRST_VINFO_VF. */
3276 if (!supports_partial_vectors
3277 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3279 mode_i++;
3280 if (mode_i == vector_modes.length ())
3281 break;
3282 continue;
3285 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_NOTE, vect_location,
3287 "***** Re-trying epilogue analysis with vector "
3288 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3290 bool fatal;
3291 opt_loop_vec_info loop_vinfo
3292 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3293 first_loop_vinfo,
3294 vector_modes, mode_i,
3295 autodetected_vector_mode, fatal);
3296 if (fatal)
3297 break;
3299 if (loop_vinfo)
3301 if (pick_lowest_cost_p)
3303 /* Keep trying to roll back vectorization attempts while the
3304 loop_vec_infos they produced were worse than this one. */
3305 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3306 while (!vinfos.is_empty ()
3307 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3309 gcc_assert (vect_epilogues);
3310 delete vinfos.pop ();
3313 /* For now only allow one epilogue loop. */
3314 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3316 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3317 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3318 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3319 || maybe_ne (lowest_th, 0U));
3320 /* Keep track of the known smallest versioning
3321 threshold. */
3322 if (ordered_p (lowest_th, th))
3323 lowest_th = ordered_min (lowest_th, th);
3325 else
3327 delete loop_vinfo;
3328 loop_vinfo = opt_loop_vec_info::success (NULL);
3331 /* For now only allow one epilogue loop, but allow
3332 pick_lowest_cost_p to replace it, so commit to the
3333 first epilogue if we have no reason to try alternatives. */
3334 if (!pick_lowest_cost_p)
3335 break;
3338 if (mode_i == vector_modes.length ())
3339 break;
3343 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3345 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3346 if (dump_enabled_p ())
3347 dump_printf_loc (MSG_NOTE, vect_location,
3348 "***** Choosing epilogue vector mode %s\n",
3349 GET_MODE_NAME
3350 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3353 return first_loop_vinfo;
3356 /* Return true if there is an in-order reduction function for CODE, storing
3357 it in *REDUC_FN if so. */
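/* Editorial example (not from the original source): an in-order reduction
   arises for a floating-point accumulation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += x[i];

   compiled without -fassociative-math, where the additions must keep
   their original left-to-right order; IFN_FOLD_LEFT_PLUS performs that
   ordered reduction over a vector of elements.  */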
3359 static bool
3360 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3362 if (code == PLUS_EXPR)
3364 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3365 return true;
3367 return false;
3370 /* Function reduction_fn_for_scalar_code
3372 Input:
3373 CODE - tree_code of a reduction operation.
3375 Output:
3376 REDUC_FN - the corresponding internal function to be used to reduce the
3377 vector of partial results into a single scalar result, or IFN_LAST
3378 if the operation is a supported reduction operation, but does not have
3379 such an internal function.
3381 Return FALSE if CODE currently cannot be vectorized as a reduction. */
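/* Editorial note (a sketch, not part of the original source): for a MAX
   reduction such as

     for (int i = 0; i < n; i++)
       m = m > a[i] ? m : a[i];

   IFN_REDUC_MAX reduces the vector of partial maxima to the final scalar.
   MULT_EXPR and MINUS_EXPR return IFN_LAST: they are still vectorizable
   as reductions, but the final value has to be computed by other means
   (e.g. element extracts and shuffles) rather than a single reduction
   instruction.  */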
3383 bool
3384 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3386 if (code.is_tree_code ())
3387 switch (tree_code (code))
3389 case MAX_EXPR:
3390 *reduc_fn = IFN_REDUC_MAX;
3391 return true;
3393 case MIN_EXPR:
3394 *reduc_fn = IFN_REDUC_MIN;
3395 return true;
3397 case PLUS_EXPR:
3398 *reduc_fn = IFN_REDUC_PLUS;
3399 return true;
3401 case BIT_AND_EXPR:
3402 *reduc_fn = IFN_REDUC_AND;
3403 return true;
3405 case BIT_IOR_EXPR:
3406 *reduc_fn = IFN_REDUC_IOR;
3407 return true;
3409 case BIT_XOR_EXPR:
3410 *reduc_fn = IFN_REDUC_XOR;
3411 return true;
3413 case MULT_EXPR:
3414 case MINUS_EXPR:
3415 *reduc_fn = IFN_LAST;
3416 return true;
3418 default:
3419 return false;
3421 else
3422 switch (combined_fn (code))
3424 CASE_CFN_FMAX:
3425 *reduc_fn = IFN_REDUC_FMAX;
3426 return true;
3428 CASE_CFN_FMIN:
3429 *reduc_fn = IFN_REDUC_FMIN;
3430 return true;
3432 default:
3433 return false;
3437 /* If there is a neutral value X such that a reduction would not be affected
3438 by the introduction of additional X elements, return that X, otherwise
3439 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3440 of the scalar elements. If the reduction has just a single initial value
3441 then INITIAL_VALUE is that value, otherwise it is null. */
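/* Hypothetical examples of the values returned below: for PLUS the neutral
   element is 0, for MULT it is 1, for BIT_AND it is all-ones, while for
   MIN/MAX no single constant works for every input, so the (single) initial
   value itself is used.  E.g. padding a sum to {s1, s2, X, X} only leaves
   the result unchanged when X == 0.  */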
3443 tree
3444 neutral_op_for_reduction (tree scalar_type, code_helper code,
3445 tree initial_value)
3447 if (code.is_tree_code ())
3448 switch (tree_code (code))
3450 case WIDEN_SUM_EXPR:
3451 case DOT_PROD_EXPR:
3452 case SAD_EXPR:
3453 case PLUS_EXPR:
3454 case MINUS_EXPR:
3455 case BIT_IOR_EXPR:
3456 case BIT_XOR_EXPR:
3457 return build_zero_cst (scalar_type);
3459 case MULT_EXPR:
3460 return build_one_cst (scalar_type);
3462 case BIT_AND_EXPR:
3463 return build_all_ones_cst (scalar_type);
3465 case MAX_EXPR:
3466 case MIN_EXPR:
3467 return initial_value;
3469 default:
3470 return NULL_TREE;
3472 else
3473 switch (combined_fn (code))
3475 CASE_CFN_FMIN:
3476 CASE_CFN_FMAX:
3477 return initial_value;
3479 default:
3480 return NULL_TREE;
3484 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3485 STMT is printed with a message MSG. */
3487 static void
3488 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3490 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3493 /* Return true if we need an in-order reduction for operation CODE
3494 on type TYPE, i.e. if it is not safe to reassociate the
3495 operation. */
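/* A small worked example of why this matters (hypothetical values): with
   32-bit floats,

     (1e20f + -1e20f) + 1.0f == 1.0f
     1e20f + (-1e20f + 1.0f) == 0.0f

   so reassociating a float summation can change the result; unless
   -fassociative-math is in effect such reductions must be done in order.  */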
3497 bool
3498 needs_fold_left_reduction_p (tree type, code_helper code)
3500 /* CHECKME: check for !flag_finite_math_only too? */
3501 if (SCALAR_FLOAT_TYPE_P (type))
3503 if (code.is_tree_code ())
3504 switch (tree_code (code))
3506 case MIN_EXPR:
3507 case MAX_EXPR:
3508 return false;
3510 default:
3511 return !flag_associative_math;
3513 else
3514 switch (combined_fn (code))
3516 CASE_CFN_FMIN:
3517 CASE_CFN_FMAX:
3518 return false;
3520 default:
3521 return !flag_associative_math;
3525 if (INTEGRAL_TYPE_P (type))
3526 return (!code.is_tree_code ()
3527 || !operation_no_trapping_overflow (type, tree_code (code)));
3529 if (SAT_FIXED_POINT_TYPE_P (type))
3530 return true;
3532 return false;
3535 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3536 has a handled computation expression. Store the main reduction
3537 operation in *CODE. */
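/* An illustrative example (SSA names made up for the illustration): for

     sum_1 = PHI <sum_0(preheader), sum_3(latch)>
     sum_2 = sum_1 + a;
     sum_3 = sum_2 + b;

   the path walked from the latch argument sum_3 back to the PHI result is
   { sum_3, sum_2, sum_1 } and *CODE is set to PLUS_EXPR.  A MINUS_EXPR step
   is canonicalized to PLUS_EXPR below, with the local NEG flag tracking
   whether the running value is negated each iteration.  */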
3539 static bool
3540 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3541 tree loop_arg, code_helper *code,
3542 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3544 auto_bitmap visited;
3545 tree lookfor = PHI_RESULT (phi);
3546 ssa_op_iter curri;
3547 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3548 while (USE_FROM_PTR (curr) != loop_arg)
3549 curr = op_iter_next_use (&curri);
3550 curri.i = curri.numops;
3553 path.safe_push (std::make_pair (curri, curr));
3554 tree use = USE_FROM_PTR (curr);
3555 if (use == lookfor)
3556 break;
3557 gimple *def = SSA_NAME_DEF_STMT (use);
3558 if (gimple_nop_p (def)
3559 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3561 pop:
3564 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3565 curri = x.first;
3566 curr = x.second;
3568 curr = op_iter_next_use (&curri);
3569 /* Skip already visited or non-SSA operands (from iterating
3570 over PHI args). */
3571 while (curr != NULL_USE_OPERAND_P
3572 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3573 || ! bitmap_set_bit (visited,
3574 SSA_NAME_VERSION
3575 (USE_FROM_PTR (curr)))));
3577 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3578 if (curr == NULL_USE_OPERAND_P)
3579 break;
3581 else
3583 if (gimple_code (def) == GIMPLE_PHI)
3584 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3585 else
3586 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3587 while (curr != NULL_USE_OPERAND_P
3588 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3589 || ! bitmap_set_bit (visited,
3590 SSA_NAME_VERSION
3591 (USE_FROM_PTR (curr)))))
3592 curr = op_iter_next_use (&curri);
3593 if (curr == NULL_USE_OPERAND_P)
3594 goto pop;
3597 while (1);
3598 if (dump_file && (dump_flags & TDF_DETAILS))
3600 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3601 unsigned i;
3602 std::pair<ssa_op_iter, use_operand_p> *x;
3603 FOR_EACH_VEC_ELT (path, i, x)
3604 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3605 dump_printf (MSG_NOTE, "\n");
3608 /* Check whether the reduction path detected is valid. */
3609 bool fail = path.length () == 0;
3610 bool neg = false;
3611 int sign = -1;
3612 *code = ERROR_MARK;
3613 for (unsigned i = 1; i < path.length (); ++i)
3615 gimple *use_stmt = USE_STMT (path[i].second);
3616 gimple_match_op op;
3617 if (!gimple_extract_op (use_stmt, &op))
3619 fail = true;
3620 break;
3622 unsigned int opi = op.num_ops;
3623 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3625 /* The following makes sure we can compute the operand index
3626 easily, and it mostly disallows chaining via COND_EXPR condition
3627 operands. */
3628 for (opi = 0; opi < op.num_ops; ++opi)
3629 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3630 break;
3632 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3634 for (opi = 0; opi < op.num_ops; ++opi)
3635 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3636 break;
3638 if (opi == op.num_ops)
3640 fail = true;
3641 break;
3643 op.code = canonicalize_code (op.code, op.type);
3644 if (op.code == MINUS_EXPR)
3646 op.code = PLUS_EXPR;
3647 /* Track whether we negate the reduction value each iteration. */
3648 if (op.ops[1] == op.ops[opi])
3649 neg = ! neg;
3651 if (CONVERT_EXPR_CODE_P (op.code)
3652 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3654 else if (*code == ERROR_MARK)
3656 *code = op.code;
3657 sign = TYPE_SIGN (op.type);
3659 else if (op.code != *code)
3661 fail = true;
3662 break;
3664 else if ((op.code == MIN_EXPR
3665 || op.code == MAX_EXPR)
3666 && sign != TYPE_SIGN (op.type))
3668 fail = true;
3669 break;
3671 /* Check that the op is used in a single stmt only. For the
3672 value-preserving tail stmts and the last stmt allow out-of-loop uses.
3673 ??? We could relax this and handle arbitrary live stmts by
3674 forcing a scalar epilogue for example. */
3675 imm_use_iterator imm_iter;
3676 gimple *op_use_stmt;
3677 unsigned cnt = 0;
3678 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3679 if (!is_gimple_debug (op_use_stmt)
3680 && (*code != ERROR_MARK
3681 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3683 /* We want to allow x + x but not x < 1 ? x : 2. */
3684 if (is_gimple_assign (op_use_stmt)
3685 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3687 use_operand_p use_p;
3688 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3689 cnt++;
3691 else
3692 cnt++;
3694 if (cnt != 1)
3696 fail = true;
3697 break;
3700 return ! fail && ! neg && *code != ERROR_MARK;
3703 bool
3704 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3705 tree loop_arg, enum tree_code code)
3707 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3708 code_helper code_;
3709 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3710 && code_ == code);
3715 /* Function vect_is_simple_reduction
3717 (1) Detect a cross-iteration def-use cycle that represents a simple
3718 reduction computation. We look for the following pattern:
3720 loop_header:
3721 a1 = phi < a0, a2 >
3722 a3 = ...
3723 a2 = operation (a3, a1)
3727 a3 = ...
3728 loop_header:
3729 a1 = phi < a0, a2 >
3730 a2 = operation (a3, a1)
3732 such that:
3733 1. operation is commutative and associative and it is safe to
3734 change the order of the computation
3735 2. no uses for a2 in the loop (a2 is used out of the loop)
3736 3. no uses of a1 in the loop besides the reduction operation
3737 4. no uses of a1 outside the loop.
3739 Conditions 1,4 are tested here.
3740 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3742 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3743 nested cycles.
3745 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3746 reductions:
3748 a1 = phi < a0, a2 >
3749 inner loop (def of a3)
3750 a2 = phi < a3 >
3752 (4) Detect condition expressions, i.e.:
3753 for (int i = 0; i < N; i++)
3754 if (a[i] < val)
3755 ret_val = a[i];
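/* Illustrative source loops (hypothetical) matching the patterns above:
   a simple reduction as in (1),

     for (i = 0; i < N; i++)
       sum += a[i];

   and a reduction chain, where several statements of the same operation
   feed each other within one iteration,

     for (i = 0; i < N; i++)
       sum = sum + a[2*i] + a[2*i+1];

   the latter is recorded via *REDUC_CHAIN_P and
   LOOP_VINFO_REDUCTION_CHAINS further below.  */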
3759 static stmt_vec_info
3760 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3761 bool *double_reduc, bool *reduc_chain_p, bool slp)
3763 gphi *phi = as_a <gphi *> (phi_info->stmt);
3764 gimple *phi_use_stmt = NULL;
3765 imm_use_iterator imm_iter;
3766 use_operand_p use_p;
3768 *double_reduc = false;
3769 *reduc_chain_p = false;
3770 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3772 tree phi_name = PHI_RESULT (phi);
3773 /* ??? If there are no uses of the PHI result the inner loop reduction
3774 won't be detected as possibly double-reduction by vectorizable_reduction
3775 because that tries to walk the PHI arg from the preheader edge which
3776 can be constant. See PR60382. */
3777 if (has_zero_uses (phi_name))
3778 return NULL;
3779 class loop *loop = (gimple_bb (phi))->loop_father;
3780 unsigned nphi_def_loop_uses = 0;
3781 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3783 gimple *use_stmt = USE_STMT (use_p);
3784 if (is_gimple_debug (use_stmt))
3785 continue;
3787 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3789 if (dump_enabled_p ())
3790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3791 "intermediate value used outside loop.\n");
3793 return NULL;
3796 nphi_def_loop_uses++;
3797 phi_use_stmt = use_stmt;
3800 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3801 if (TREE_CODE (latch_def) != SSA_NAME)
3803 if (dump_enabled_p ())
3804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3805 "reduction: not ssa_name: %T\n", latch_def);
3806 return NULL;
3809 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3810 if (!def_stmt_info
3811 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3812 return NULL;
3814 bool nested_in_vect_loop
3815 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3816 unsigned nlatch_def_loop_uses = 0;
3817 auto_vec<gphi *, 3> lcphis;
3818 bool inner_loop_of_double_reduc = false;
3819 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3821 gimple *use_stmt = USE_STMT (use_p);
3822 if (is_gimple_debug (use_stmt))
3823 continue;
3824 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3825 nlatch_def_loop_uses++;
3826 else
3828 /* We can have more than one loop-closed PHI. */
3829 lcphis.safe_push (as_a <gphi *> (use_stmt));
3830 if (nested_in_vect_loop
3831 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3832 == vect_double_reduction_def))
3833 inner_loop_of_double_reduc = true;
3837 /* If we are vectorizing an inner reduction we are executing that
3838 in the original order only if we are not dealing with a
3839 double reduction. */
3840 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3842 if (dump_enabled_p ())
3843 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3844 "detected nested cycle: ");
3845 return def_stmt_info;
3848 /* When the inner loop of a double reduction ends up with more than
3849 one loop-closed PHI we have failed to classify alternate such
3850 PHIs as double reductions, leading to wrong code. See PR103237. */
3851 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3853 if (dump_enabled_p ())
3854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3855 "unhandle double reduction\n");
3856 return NULL;
3859 /* If this isn't a nested cycle or if the nested cycle reduction value
3860 is used outside of the inner loop we cannot handle uses of the reduction
3861 value. */
3862 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3864 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3866 "reduction used in loop.\n");
3867 return NULL;
3870 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3871 defined in the inner loop. */
3872 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3874 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3875 if (gimple_phi_num_args (def_stmt) != 1
3876 || TREE_CODE (op1) != SSA_NAME)
3878 if (dump_enabled_p ())
3879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3880 "unsupported phi node definition.\n");
3882 return NULL;
3885 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3886 if (gimple_bb (def1)
3887 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3888 && loop->inner
3889 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3890 && (is_gimple_assign (def1) || is_gimple_call (def1))
3891 && is_a <gphi *> (phi_use_stmt)
3892 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3894 if (dump_enabled_p ())
3895 report_vect_op (MSG_NOTE, def_stmt,
3896 "detected double reduction: ");
3898 *double_reduc = true;
3899 return def_stmt_info;
3902 return NULL;
3905 /* Look for the expression computing latch_def from the loop PHI result. */
3906 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3907 code_helper code;
3908 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3909 path))
3911 STMT_VINFO_REDUC_CODE (phi_info) = code;
3912 if (code == COND_EXPR && !nested_in_vect_loop)
3913 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3915 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3916 reduction chain for which the additional restriction is that
3917 all operations in the chain are the same. */
3918 auto_vec<stmt_vec_info, 8> reduc_chain;
3919 unsigned i;
3920 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3921 for (i = path.length () - 1; i >= 1; --i)
3923 gimple *stmt = USE_STMT (path[i].second);
3924 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3925 gimple_match_op op;
3926 if (!gimple_extract_op (stmt, &op))
3927 gcc_unreachable ();
3928 if (gassign *assign = dyn_cast<gassign *> (stmt))
3929 STMT_VINFO_REDUC_IDX (stmt_info)
3930 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3931 else
3933 gcall *call = as_a<gcall *> (stmt);
3934 STMT_VINFO_REDUC_IDX (stmt_info)
3935 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3937 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3938 && (i == 1 || i == path.length () - 1));
3939 if ((op.code != code && !leading_conversion)
3940 /* We can only handle the final value in epilogue
3941 generation for reduction chains. */
3942 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3943 is_slp_reduc = false;
3944 /* For reduction chains we support trailing/leading
3945 conversions. We do not store those in the actual chain. */
3946 if (leading_conversion)
3947 continue;
3948 reduc_chain.safe_push (stmt_info);
3950 if (slp && is_slp_reduc && reduc_chain.length () > 1)
3952 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3954 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3955 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3957 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3958 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3960 /* Save the chain for further analysis in SLP detection. */
3961 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3962 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3964 *reduc_chain_p = true;
3965 if (dump_enabled_p ())
3966 dump_printf_loc (MSG_NOTE, vect_location,
3967 "reduction: detected reduction chain\n");
3969 else if (dump_enabled_p ())
3970 dump_printf_loc (MSG_NOTE, vect_location,
3971 "reduction: detected reduction\n");
3973 return def_stmt_info;
3976 if (dump_enabled_p ())
3977 dump_printf_loc (MSG_NOTE, vect_location,
3978 "reduction: unknown pattern\n");
3980 return NULL;
3983 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3984 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3985 or -1 if not known. */
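/* A worked example with hypothetical numbers: for 100 known scalar
   iterations, an assumed VF of 8 and 3 peeled prologue iterations we get
   (100 - 3) % 8 == 1 epilogue iteration; with an unknown iteration count
   or unknown prologue peeling the estimate falls back to VF / 2 == 4.  */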
3987 static int
3988 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3990 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3991 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3993 if (dump_enabled_p ())
3994 dump_printf_loc (MSG_NOTE, vect_location,
3995 "cost model: epilogue peel iters set to vf/2 "
3996 "because loop iterations are unknown .\n");
3997 return assumed_vf / 2;
3999 else
4001 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4002 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4003 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4004 /* If we need to peel for gaps but the epilogue would otherwise need
4005 no iterations, we have to peel VF iterations. */
4006 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4007 peel_iters_epilogue = assumed_vf;
4008 return peel_iters_epilogue;
4012 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4013 int
4014 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4015 int *peel_iters_epilogue,
4016 stmt_vector_for_cost *scalar_cost_vec,
4017 stmt_vector_for_cost *prologue_cost_vec,
4018 stmt_vector_for_cost *epilogue_cost_vec)
4020 int retval = 0;
4022 *peel_iters_epilogue
4023 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4025 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4027 /* If peeled iterations are known but the number of scalar loop
4028 iterations is unknown, count a taken branch per peeled loop. */
4029 if (peel_iters_prologue > 0)
4030 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4031 vect_prologue);
4032 if (*peel_iters_epilogue > 0)
4033 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4034 vect_epilogue);
4037 stmt_info_for_cost *si;
4038 int j;
4039 if (peel_iters_prologue)
4040 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4041 retval += record_stmt_cost (prologue_cost_vec,
4042 si->count * peel_iters_prologue,
4043 si->kind, si->stmt_info, si->misalign,
4044 vect_prologue);
4045 if (*peel_iters_epilogue)
4046 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4047 retval += record_stmt_cost (epilogue_cost_vec,
4048 si->count * *peel_iters_epilogue,
4049 si->kind, si->stmt_info, si->misalign,
4050 vect_epilogue);
4052 return retval;
4055 /* Function vect_estimate_min_profitable_iters
4057 Return the number of iterations required for the vector version of the
4058 loop to be profitable relative to the cost of the scalar version of the
4059 loop.
4061 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4062 of iterations for vectorization. -1 value means loop vectorization
4063 is not profitable. This returned value may be used for dynamic
4064 profitability check.
4066 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4067 for static check against estimated number of iterations. */
4069 static void
4070 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4071 int *ret_min_profitable_niters,
4072 int *ret_min_profitable_estimate,
4073 unsigned *suggested_unroll_factor)
4075 int min_profitable_iters;
4076 int min_profitable_estimate;
4077 int peel_iters_prologue;
4078 int peel_iters_epilogue;
4079 unsigned vec_inside_cost = 0;
4080 int vec_outside_cost = 0;
4081 unsigned vec_prologue_cost = 0;
4082 unsigned vec_epilogue_cost = 0;
4083 int scalar_single_iter_cost = 0;
4084 int scalar_outside_cost = 0;
4085 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4086 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4087 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4089 /* Cost model disabled. */
4090 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4092 if (dump_enabled_p ())
4093 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4094 *ret_min_profitable_niters = 0;
4095 *ret_min_profitable_estimate = 0;
4096 return;
4099 /* Requires loop versioning tests to handle misalignment. */
4100 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4102 /* FIXME: Make cost depend on complexity of individual check. */
4103 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4104 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4105 if (dump_enabled_p ())
4106 dump_printf (MSG_NOTE,
4107 "cost model: Adding cost of checks for loop "
4108 "versioning to treat misalignment.\n");
4111 /* Requires loop versioning with alias checks. */
4112 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4114 /* FIXME: Make cost depend on complexity of individual check. */
4115 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4116 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4117 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4118 if (len)
4119 /* Count LEN - 1 ANDs and LEN comparisons. */
4120 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4121 scalar_stmt, vect_prologue);
4122 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4123 if (len)
4125 /* Count LEN - 1 ANDs and LEN comparisons. */
4126 unsigned int nstmts = len * 2 - 1;
4127 /* +1 for each bias that needs adding. */
4128 for (unsigned int i = 0; i < len; ++i)
4129 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4130 nstmts += 1;
4131 (void) add_stmt_cost (target_cost_data, nstmts,
4132 scalar_stmt, vect_prologue);
4134 if (dump_enabled_p ())
4135 dump_printf (MSG_NOTE,
4136 "cost model: Adding cost of checks for loop "
4137 "versioning aliasing.\n");
4140 /* Requires loop versioning with niter checks. */
4141 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4143 /* FIXME: Make cost depend on complexity of individual check. */
4144 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4145 NULL, NULL, NULL_TREE, 0, vect_prologue);
4146 if (dump_enabled_p ())
4147 dump_printf (MSG_NOTE,
4148 "cost model: Adding cost of checks for loop "
4149 "versioning niters.\n");
4152 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4153 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4154 vect_prologue);
4156 /* Count statements in scalar loop. Using this as scalar cost for a single
4157 iteration for now.
4159 TODO: Add outer loop support.
4161 TODO: Consider assigning different costs to different scalar
4162 statements. */
4164 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4166 /* Add additional cost for the peeled instructions in prologue and epilogue
4167 loop. (For fully-masked loops there will be no peeling.)
4169 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4170 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4172 TODO: Build an expression that represents peel_iters for prologue and
4173 epilogue to be used in a run-time test. */
4175 bool prologue_need_br_taken_cost = false;
4176 bool prologue_need_br_not_taken_cost = false;
4178 /* Calculate peel_iters_prologue. */
4179 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4180 peel_iters_prologue = 0;
4181 else if (npeel < 0)
4183 peel_iters_prologue = assumed_vf / 2;
4184 if (dump_enabled_p ())
4185 dump_printf (MSG_NOTE, "cost model: "
4186 "prologue peel iters set to vf/2.\n");
4188 /* If peeled iterations are unknown, count a taken branch and a not taken
4189 branch per peeled loop. Even if scalar loop iterations are known,
4190 vector iterations are not known since peeled prologue iterations are
4191 not known. Hence guards remain the same. */
4192 prologue_need_br_taken_cost = true;
4193 prologue_need_br_not_taken_cost = true;
4195 else
4197 peel_iters_prologue = npeel;
4198 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4199 /* If peeled iterations are known but the number of scalar loop
4200 iterations is unknown, count a taken branch per peeled loop. */
4201 prologue_need_br_taken_cost = true;
4204 bool epilogue_need_br_taken_cost = false;
4205 bool epilogue_need_br_not_taken_cost = false;
4207 /* Calculate peel_iters_epilogue. */
4208 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4209 /* We need to peel exactly one iteration for gaps. */
4210 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4211 else if (npeel < 0)
4213 /* If the peeling for alignment is unknown, the loop bound of the
4214 main loop becomes unknown. */
4215 peel_iters_epilogue = assumed_vf / 2;
4216 if (dump_enabled_p ())
4217 dump_printf (MSG_NOTE, "cost model: "
4218 "epilogue peel iters set to vf/2 because "
4219 "peeling for alignment is unknown.\n");
4221 /* See the same reason above in peel_iters_prologue calculation. */
4222 epilogue_need_br_taken_cost = true;
4223 epilogue_need_br_not_taken_cost = true;
4225 else
4227 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4228 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4229 /* If peeled iterations are known but the number of scalar loop
4230 iterations is unknown, count a taken branch per peeled loop. */
4231 epilogue_need_br_taken_cost = true;
4234 stmt_info_for_cost *si;
4235 int j;
4236 /* Add costs associated with peel_iters_prologue. */
4237 if (peel_iters_prologue)
4238 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4240 (void) add_stmt_cost (target_cost_data,
4241 si->count * peel_iters_prologue, si->kind,
4242 si->stmt_info, si->node, si->vectype,
4243 si->misalign, vect_prologue);
4246 /* Add costs associated with peel_iters_epilogue. */
4247 if (peel_iters_epilogue)
4248 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4250 (void) add_stmt_cost (target_cost_data,
4251 si->count * peel_iters_epilogue, si->kind,
4252 si->stmt_info, si->node, si->vectype,
4253 si->misalign, vect_epilogue);
4256 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4258 if (prologue_need_br_taken_cost)
4259 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4260 vect_prologue);
4262 if (prologue_need_br_not_taken_cost)
4263 (void) add_stmt_cost (target_cost_data, 1,
4264 cond_branch_not_taken, vect_prologue);
4266 if (epilogue_need_br_taken_cost)
4267 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4268 vect_epilogue);
4270 if (epilogue_need_br_not_taken_cost)
4271 (void) add_stmt_cost (target_cost_data, 1,
4272 cond_branch_not_taken, vect_epilogue);
4274 /* Take care of special costs for rgroup controls of partial vectors. */
4275 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4277 /* Calculate how many masks we need to generate. */
4278 unsigned int num_masks = 0;
4279 rgroup_controls *rgm;
4280 unsigned int num_vectors_m1;
4281 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4282 if (rgm->type)
4283 num_masks += num_vectors_m1 + 1;
4284 gcc_assert (num_masks > 0);
4286 /* In the worst case, we need to generate each mask in the prologue
4287 and in the loop body. One of the loop body mask instructions
4288 replaces the comparison in the scalar loop, and since we don't
4289 count the scalar comparison against the scalar body, we shouldn't
4290 count that vector instruction against the vector body either.
4292 Sometimes we can use unpacks instead of generating prologue
4293 masks and sometimes the prologue mask will fold to a constant,
4294 so the actual prologue cost might be smaller. However, it's
4295 simpler and safer to use the worst-case cost; if this ends up
4296 being the tie-breaker between vectorizing or not, then it's
4297 probably better not to vectorize. */
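/* For example (hypothetical rgroup layout): one rgroup needing a single
   mask vector and another needing two gives num_masks == 3, so three mask
   computations are costed in the prologue and 3 - 1 == 2 in the body,
   since one body mask replaces the scalar loop's comparison.  */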
4298 (void) add_stmt_cost (target_cost_data, num_masks,
4299 vector_stmt, NULL, NULL, NULL_TREE, 0,
4300 vect_prologue);
4301 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4302 vector_stmt, NULL, NULL, NULL_TREE, 0,
4303 vect_body);
4305 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4307 /* Referring to the functions vect_set_loop_condition_partial_vectors
4308 and vect_set_loop_controls_directly, we need to generate each
4309 length in the prologue and in the loop body if required. Although
4310 there are some possible optimizations, we consider the worst case
4311 here. */
4313 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4314 signed char partial_load_store_bias
4315 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4316 bool need_iterate_p
4317 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4318 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4320 /* Calculate how many statements to be added. */
4321 unsigned int prologue_stmts = 0;
4322 unsigned int body_stmts = 0;
4324 rgroup_controls *rgc;
4325 unsigned int num_vectors_m1;
4326 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4327 if (rgc->type)
4329 /* May need one SHIFT for nitems_total computation. */
4330 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4331 if (nitems != 1 && !niters_known_p)
4332 prologue_stmts += 1;
4334 /* May need one MAX and one MINUS for wrap around. */
4335 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4336 prologue_stmts += 2;
4338 /* Need one MAX and one MINUS for each batch limit except for
4339 the first one. */
4340 prologue_stmts += num_vectors_m1 * 2;
4342 unsigned int num_vectors = num_vectors_m1 + 1;
4344 /* Need to set up lengths in prologue, only one MIN required
4345 for each since start index is zero. */
4346 prologue_stmts += num_vectors;
4348 /* If we have a non-zero partial load bias, we need one PLUS
4349 to adjust the load length. */
4350 if (partial_load_store_bias != 0)
4351 body_stmts += 1;
4353 /* Each may need two MINs and one MINUS to update lengths in body
4354 for next iteration. */
4355 if (need_iterate_p)
4356 body_stmts += 3 * num_vectors;
4359 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4360 scalar_stmt, vect_prologue);
4361 (void) add_stmt_cost (target_cost_data, body_stmts,
4362 scalar_stmt, vect_body);
4365 /* FORNOW: The scalar outside cost is incremented in one of the
4366 following ways:
4368 1. The vectorizer checks for alignment and aliasing and generates
4369 a condition that allows dynamic vectorization. A cost model
4370 check is ANDED with the versioning condition. Hence scalar code
4371 path now has the added cost of the versioning check.
4373 if (cost > th & versioning_check)
4374 jmp to vector code
4376 Hence run-time scalar is incremented by not-taken branch cost.
4378 2. The vectorizer then checks if a prologue is required. If the
4379 cost model check was not done before during versioning, it has to
4380 be done before the prologue check.
4382 if (cost <= th)
4383 prologue = scalar_iters
4384 if (prologue == 0)
4385 jmp to vector code
4386 else
4387 execute prologue
4388 if (prologue == num_iters)
4389 go to exit
4391 Hence the run-time scalar cost is incremented by a taken branch,
4392 plus a not-taken branch, plus a taken branch cost.
4394 3. The vectorizer then checks if an epilogue is required. If the
4395 cost model check was not done before during prologue check, it
4396 has to be done with the epilogue check.
4398 if (prologue == 0)
4399 jmp to vector code
4400 else
4401 execute prologue
4402 if (prologue == num_iters)
4403 go to exit
4404 vector code:
4405 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4406 jmp to epilogue
4408 Hence the run-time scalar cost should be incremented by 2 taken
4409 branches.
4411 TODO: The back end may reorder the BBs differently and reverse
4412 conditions/branch directions. Change the estimates below to
4413 something more reasonable. */
4415 /* If the number of iterations is known and we do not do versioning, we can
4416 decide whether to vectorize at compile time. Hence the scalar version
4417 does not carry cost model guard costs. */
4418 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4419 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4421 /* Cost model check occurs at versioning. */
4422 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4423 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4424 else
4426 /* Cost model check occurs at prologue generation. */
4427 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4428 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4429 + vect_get_stmt_cost (cond_branch_not_taken);
4430 /* Cost model check occurs at epilogue generation. */
4431 else
4432 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4436 /* Complete the target-specific cost calculations. */
4437 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4438 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4439 suggested_unroll_factor);
4441 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4442 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4443 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4444 *suggested_unroll_factor,
4445 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4447 if (dump_enabled_p ())
4448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4449 "can't unroll as unrolled vectorization factor larger"
4450 " than maximum vectorization factor: "
4451 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4452 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4453 *suggested_unroll_factor = 1;
4456 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4458 if (dump_enabled_p ())
4460 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4461 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4462 vec_inside_cost);
4463 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4464 vec_prologue_cost);
4465 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4466 vec_epilogue_cost);
4467 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4468 scalar_single_iter_cost);
4469 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4470 scalar_outside_cost);
4471 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4472 vec_outside_cost);
4473 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4474 peel_iters_prologue);
4475 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4476 peel_iters_epilogue);
4479 /* Calculate number of iterations required to make the vector version
4480 profitable, relative to the loop bodies only. The following condition
4481 must hold true:
4482 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4483 where
4484 SIC = scalar iteration cost, VIC = vector iteration cost,
4485 VOC = vector outside cost, VF = vectorization factor,
4486 NPEEL = prologue iterations + epilogue iterations,
4487 SOC = scalar outside cost for run time cost model check. */
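/* A worked instance with hypothetical costs: SIC = 1, VIC = 2, VF = 4,
   VOC = 20, SOC = 4 and NPEEL = 2 give a per-vector-iteration saving of
   SIC * VF - VIC = 2, and the condition

     1 * niters + 4 > 2 * ((niters - 2) / 4) + 20

   starts to hold around
   ((VOC - SOC) * VF - VIC * NPEEL) / (SIC * VF - VIC) = 60 / 2 = 30
   scalar iterations; the code below then applies the rounding and
   minimum-VF adjustments.  */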
4489 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4490 - vec_inside_cost);
4491 if (saving_per_viter <= 0)
4493 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4494 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4495 "vectorization did not happen for a simd loop");
4497 if (dump_enabled_p ())
4498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4499 "cost model: the vector iteration cost = %d "
4500 "divided by the scalar iteration cost = %d "
4501 "is greater or equal to the vectorization factor = %d"
4502 ".\n",
4503 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4504 *ret_min_profitable_niters = -1;
4505 *ret_min_profitable_estimate = -1;
4506 return;
4509 /* ??? The "if" arm is written to handle all cases; see below for what
4510 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4511 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4513 /* Rewriting the condition above in terms of the number of
4514 vector iterations (vniters) rather than the number of
4515 scalar iterations (niters) gives:
4517 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4519 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4521 For integer N, X and Y when X > 0:
4523 N * X > Y <==> N >= (Y /[floor] X) + 1. */
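/* E.g. X = 3, Y = 7: N * 3 > 7 first holds at N = 3, and
   (7 /[floor] 3) + 1 == 3 as well.  */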
4524 int outside_overhead = (vec_outside_cost
4525 - scalar_single_iter_cost * peel_iters_prologue
4526 - scalar_single_iter_cost * peel_iters_epilogue
4527 - scalar_outside_cost);
4528 /* We're only interested in cases that require at least one
4529 vector iteration. */
4530 int min_vec_niters = 1;
4531 if (outside_overhead > 0)
4532 min_vec_niters = outside_overhead / saving_per_viter + 1;
4534 if (dump_enabled_p ())
4535 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4536 min_vec_niters);
4538 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4540 /* Now that we know the minimum number of vector iterations,
4541 find the minimum niters for which the scalar cost is larger:
4543 SIC * niters > VIC * vniters + VOC - SOC
4545 We know that the minimum niters is no more than
4546 vniters * VF + NPEEL, but it might be (and often is) less
4547 than that if a partial vector iteration is cheaper than the
4548 equivalent scalar code. */
4549 int threshold = (vec_inside_cost * min_vec_niters
4550 + vec_outside_cost
4551 - scalar_outside_cost);
4552 if (threshold <= 0)
4553 min_profitable_iters = 1;
4554 else
4555 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4557 else
4558 /* Convert the number of vector iterations into a number of
4559 scalar iterations. */
4560 min_profitable_iters = (min_vec_niters * assumed_vf
4561 + peel_iters_prologue
4562 + peel_iters_epilogue);
4564 else
4566 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4567 * assumed_vf
4568 - vec_inside_cost * peel_iters_prologue
4569 - vec_inside_cost * peel_iters_epilogue);
4570 if (min_profitable_iters <= 0)
4571 min_profitable_iters = 0;
4572 else
4574 min_profitable_iters /= saving_per_viter;
4576 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4577 <= (((int) vec_inside_cost * min_profitable_iters)
4578 + (((int) vec_outside_cost - scalar_outside_cost)
4579 * assumed_vf)))
4580 min_profitable_iters++;
4584 if (dump_enabled_p ())
4585 dump_printf (MSG_NOTE,
4586 " Calculated minimum iters for profitability: %d\n",
4587 min_profitable_iters);
4589 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4590 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4591 /* We want the vectorized loop to execute at least once. */
4592 min_profitable_iters = assumed_vf + peel_iters_prologue;
4593 else if (min_profitable_iters < peel_iters_prologue)
4594 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4595 vectorized loop executes at least once. */
4596 min_profitable_iters = peel_iters_prologue;
4598 if (dump_enabled_p ())
4599 dump_printf_loc (MSG_NOTE, vect_location,
4600 " Runtime profitability threshold = %d\n",
4601 min_profitable_iters);
4603 *ret_min_profitable_niters = min_profitable_iters;
4605 /* Calculate number of iterations required to make the vector version
4606 profitable, relative to the loop bodies only.
4608 The non-vectorized variant costs SIC * niters and it must win over the
4609 vector variant on the expected loop trip count. The following condition must hold true:
4610 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4612 if (vec_outside_cost <= 0)
4613 min_profitable_estimate = 0;
4614 /* ??? This "else if" arm is written to handle all cases; see below for
4615 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4616 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4618 /* This is a repeat of the code above, but with + SOC rather
4619 than - SOC. */
4620 int outside_overhead = (vec_outside_cost
4621 - scalar_single_iter_cost * peel_iters_prologue
4622 - scalar_single_iter_cost * peel_iters_epilogue
4623 + scalar_outside_cost);
4624 int min_vec_niters = 1;
4625 if (outside_overhead > 0)
4626 min_vec_niters = outside_overhead / saving_per_viter + 1;
4628 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4630 int threshold = (vec_inside_cost * min_vec_niters
4631 + vec_outside_cost
4632 + scalar_outside_cost);
4633 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4635 else
4636 min_profitable_estimate = (min_vec_niters * assumed_vf
4637 + peel_iters_prologue
4638 + peel_iters_epilogue);
4640 else
4642 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4643 * assumed_vf
4644 - vec_inside_cost * peel_iters_prologue
4645 - vec_inside_cost * peel_iters_epilogue)
4646 / ((scalar_single_iter_cost * assumed_vf)
4647 - vec_inside_cost);
4649 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4650 if (dump_enabled_p ())
4651 dump_printf_loc (MSG_NOTE, vect_location,
4652 " Static estimate profitability threshold = %d\n",
4653 min_profitable_estimate);
4655 *ret_min_profitable_estimate = min_profitable_estimate;
4658 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4659 vector elements (not bits) for a vector with NELT elements. */
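/* A hypothetical instance: OFFSET == 2 and NELT == 8 encode the stepped
   selector {2, 3, 4, ...}, i.e. {2, 3, 4, 5, 6, 7, 8, 9}, shifting the
   input down by two elements; indices 8 and 9 fall into the second
   vec_perm operand, which vec_perm_indices accounts for.  */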
4660 static void
4661 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4662 vec_perm_builder *sel)
4664 /* The encoding is a single stepped pattern. Any wrap-around is handled
4665 by vec_perm_indices. */
4666 sel->new_vector (nelt, 1, 3);
4667 for (unsigned int i = 0; i < 3; i++)
4668 sel->quick_push (i + offset);
4671 /* Checks whether the target supports whole-vector shifts for vectors of mode
4672 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4673 it supports vec_perm_const with masks for all necessary shift amounts. */
4674 static bool
4675 have_whole_vector_shift (machine_mode mode)
4677 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4678 return true;
4680 /* Variable-length vectors should be handled via the optab. */
4681 unsigned int nelt;
4682 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4683 return false;
4685 vec_perm_builder sel;
4686 vec_perm_indices indices;
4687 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4689 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4690 indices.new_vector (sel, 2, nelt);
4691 if (!can_vec_perm_const_p (mode, mode, indices, false))
4692 return false;
4694 return true;
4697 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4698 multiplication operands have differing signs and (b) we intend
4699 to emulate the operation using a series of signed DOT_PROD_EXPRs.
4700 See vect_emulate_mixed_dot_prod for the actual sequence used. */
4702 static bool
4703 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
4704 stmt_vec_info stmt_info)
4706 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4707 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4708 return false;
4710 tree rhs1 = gimple_assign_rhs1 (assign);
4711 tree rhs2 = gimple_assign_rhs2 (assign);
4712 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4713 return false;
4715 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4716 gcc_assert (reduc_info->is_reduc_info);
4717 return !directly_supported_p (DOT_PROD_EXPR,
4718 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
4719 optab_vector_mixed_sign);
4722 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4723 functions. Design better to avoid maintenance issues. */
4725 /* Function vect_model_reduction_cost.
4727 Models cost for a reduction operation, including the vector ops
4728 generated within the strip-mine loop in some cases, the initial
4729 definition before the loop, and the epilogue code that must be generated. */
4731 static void
4732 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4733 stmt_vec_info stmt_info, internal_fn reduc_fn,
4734 vect_reduction_type reduction_type,
4735 int ncopies, stmt_vector_for_cost *cost_vec)
4737 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4738 tree vectype;
4739 machine_mode mode;
4740 class loop *loop = NULL;
4742 if (loop_vinfo)
4743 loop = LOOP_VINFO_LOOP (loop_vinfo);
4745 /* Condition reductions generate two reductions in the loop. */
4746 if (reduction_type == COND_REDUCTION)
4747 ncopies *= 2;
4749 vectype = STMT_VINFO_VECTYPE (stmt_info);
4750 mode = TYPE_MODE (vectype);
4751 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4753 gimple_match_op op;
4754 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4755 gcc_unreachable ();
4757 bool emulated_mixed_dot_prod
4758 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
4759 if (reduction_type == EXTRACT_LAST_REDUCTION)
4760 /* No extra instructions are needed in the prologue. The loop body
4761 operations are costed in vectorizable_condition. */
4762 inside_cost = 0;
4763 else if (reduction_type == FOLD_LEFT_REDUCTION)
4765 /* No extra instructions needed in the prologue. */
4766 prologue_cost = 0;
4768 if (reduc_fn != IFN_LAST)
4769 /* Count one reduction-like operation per vector. */
4770 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4771 stmt_info, 0, vect_body);
4772 else
4774 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4775 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4776 inside_cost = record_stmt_cost (cost_vec, nelements,
4777 vec_to_scalar, stmt_info, 0,
4778 vect_body);
4779 inside_cost += record_stmt_cost (cost_vec, nelements,
4780 scalar_stmt, stmt_info, 0,
4781 vect_body);
4784 else
4786 /* Add in the cost of the initial definitions. */
4787 int prologue_stmts;
4788 if (reduction_type == COND_REDUCTION)
4789 /* For cond reductions we have four vectors: initial index, step,
4790 initial result of the data reduction, initial value of the index
4791 reduction. */
4792 prologue_stmts = 4;
4793 else if (emulated_mixed_dot_prod)
4794 /* We need the initial reduction value and two invariants:
4795 one that contains the minimum signed value and one that
4796 contains half of its negative. */
4797 prologue_stmts = 3;
4798 else
4799 prologue_stmts = 1;
4800 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4801 scalar_to_vec, stmt_info, 0,
4802 vect_prologue);
4805 /* Determine cost of epilogue code.
4807 We have a reduction operator that will reduce the vector in one statement.
4808 Also requires scalar extract. */
4810 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4812 if (reduc_fn != IFN_LAST)
4814 if (reduction_type == COND_REDUCTION)
4816 /* An EQ stmt and a COND_EXPR stmt. */
4817 epilogue_cost += record_stmt_cost (cost_vec, 2,
4818 vector_stmt, stmt_info, 0,
4819 vect_epilogue);
4820 /* Reduction of the max index and a reduction of the found
4821 values. */
4822 epilogue_cost += record_stmt_cost (cost_vec, 2,
4823 vec_to_scalar, stmt_info, 0,
4824 vect_epilogue);
4825 /* A broadcast of the max value. */
4826 epilogue_cost += record_stmt_cost (cost_vec, 1,
4827 scalar_to_vec, stmt_info, 0,
4828 vect_epilogue);
4830 else
4832 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4833 stmt_info, 0, vect_epilogue);
4834 epilogue_cost += record_stmt_cost (cost_vec, 1,
4835 vec_to_scalar, stmt_info, 0,
4836 vect_epilogue);
4839 else if (reduction_type == COND_REDUCTION)
4841 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4842 /* Extraction of scalar elements. */
4843 epilogue_cost += record_stmt_cost (cost_vec,
4844 2 * estimated_nunits,
4845 vec_to_scalar, stmt_info, 0,
4846 vect_epilogue);
4847 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4848 epilogue_cost += record_stmt_cost (cost_vec,
4849 2 * estimated_nunits - 3,
4850 scalar_stmt, stmt_info, 0,
4851 vect_epilogue);
4853 else if (reduction_type == EXTRACT_LAST_REDUCTION
4854 || reduction_type == FOLD_LEFT_REDUCTION)
4855 /* No extra instructions needed in the epilogue. */
4857 else
4859 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4860 tree bitsize = TYPE_SIZE (op.type);
4861 int element_bitsize = tree_to_uhwi (bitsize);
4862 int nelements = vec_size_in_bits / element_bitsize;
4864 if (op.code == COND_EXPR)
4865 op.code = MAX_EXPR;
4867 /* We have a whole vector shift available. */
4868 if (VECTOR_MODE_P (mode)
4869 && directly_supported_p (op.code, vectype)
4870 && have_whole_vector_shift (mode))
4872 /* Final reduction via vector shifts and the reduction operator.
4873 Also requires scalar extract. */
4874 epilogue_cost += record_stmt_cost (cost_vec,
4875 exact_log2 (nelements) * 2,
4876 vector_stmt, stmt_info, 0,
4877 vect_epilogue);
4878 epilogue_cost += record_stmt_cost (cost_vec, 1,
4879 vec_to_scalar, stmt_info, 0,
4880 vect_epilogue);
4882 else
4883 /* Use extracts and reduction op for final reduction. For N
4884 elements, we have N extracts and N-1 reduction ops. */
4885 epilogue_cost += record_stmt_cost (cost_vec,
4886 nelements + nelements - 1,
4887 vector_stmt, stmt_info, 0,
4888 vect_epilogue);
4892 if (dump_enabled_p ())
4893 dump_printf (MSG_NOTE,
4894 "vect_model_reduction_cost: inside_cost = %d, "
4895 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4896 prologue_cost, epilogue_cost);
4899 /* SEQ is a sequence of instructions that initialize the reduction
4900 described by REDUC_INFO. Emit them in the appropriate place. */
4902 static void
4903 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4904 stmt_vec_info reduc_info, gimple *seq)
4906 if (reduc_info->reused_accumulator)
4908 /* When reusing an accumulator from the main loop, we only need
4909 initialization instructions if the main loop can be skipped.
4910 In that case, emit the initialization instructions at the end
4911 of the guard block that does the skip. */
4912 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4913 gcc_assert (skip_edge);
4914 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4915 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4917 else
4919 /* The normal case: emit the initialization instructions on the
4920 preheader edge. */
4921 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4922 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4926 /* Function get_initial_def_for_reduction
4928 Input:
4929 REDUC_INFO - the info_for_reduction
4930 INIT_VAL - the initial value of the reduction variable
4931 NEUTRAL_OP - a value that has no effect on the reduction, as per
4932 neutral_op_for_reduction
4934 Output:
4935 Return a vector variable, initialized according to the reduction that
4936 REDUC_INFO describes. This vector will be used as the initial value
4937 of the vector of partial results.
4939 The value we need is a vector in which element 0 has value INIT_VAL
4940 and every other element has value NEUTRAL_OP. */
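/* An illustrative case (lane count hypothetical): for a PLUS reduction
   with initial value s and four lanes the result is {s, 0, 0, 0}, so that
   reducing the vector of partial sums at the end yields s plus the sum of
   all elements; when INIT_VAL already equals NEUTRAL_OP the code below
   simply splats that value.  */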
4942 static tree
4943 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4944 stmt_vec_info reduc_info,
4945 tree init_val, tree neutral_op)
4947 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4948 tree scalar_type = TREE_TYPE (init_val);
4949 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4950 tree init_def;
4951 gimple_seq stmts = NULL;
4953 gcc_assert (vectype);
4955 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4956 || SCALAR_FLOAT_TYPE_P (scalar_type));
4958 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4959 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4961 if (operand_equal_p (init_val, neutral_op))
4963 /* If both elements are equal then the vector described above is
4964 just a splat. */
4965 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4966 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4968 else
4970 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4971 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4972 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4974 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4975 element 0. */
4976 init_def = gimple_build_vector_from_val (&stmts, vectype,
4977 neutral_op);
4978 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4979 vectype, init_def, init_val);
4981 else
4983 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4984 tree_vector_builder elts (vectype, 1, 2);
4985 elts.quick_push (init_val);
4986 elts.quick_push (neutral_op);
4987 init_def = gimple_build_vector (&stmts, &elts);
4991 if (stmts)
4992 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4993 return init_def;
4996 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4997 which performs a reduction involving GROUP_SIZE scalar statements.
4998 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4999 is nonnull, introducing extra elements of that value will not change the
5000 result. */
5002 static void
5003 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5004 stmt_vec_info reduc_info,
5005 vec<tree> *vec_oprnds,
5006 unsigned int number_of_vectors,
5007 unsigned int group_size, tree neutral_op)
5009 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5010 unsigned HOST_WIDE_INT nunits;
5011 unsigned j, number_of_places_left_in_vector;
5012 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5013 unsigned int i;
5015 gcc_assert (group_size == initial_values.length () || neutral_op);
5017 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5018 created vectors. It is greater than 1 if unrolling is performed.
5020 For example, we have two scalar operands, s1 and s2 (e.g., group of
5021 strided accesses of size two), while NUNITS is four (i.e., four scalars
5022 of this type can be packed in a vector). The output vector will contain
5023 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5024 will be 2).
5026 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5027 vectors containing the operands.
5029 For example, NUNITS is four as before, and the group size is 8
5030 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5031 {s5, s6, s7, s8}. */
5033 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5034 nunits = group_size;
5036 number_of_places_left_in_vector = nunits;
5037 bool constant_p = true;
5038 tree_vector_builder elts (vector_type, nunits, 1);
5039 elts.quick_grow (nunits);
5040 gimple_seq ctor_seq = NULL;
5041 for (j = 0; j < nunits * number_of_vectors; ++j)
5043 tree op;
5044 i = j % group_size;
5046 /* Get the def before the loop. In a reduction chain we have only
5047 one initial value; otherwise we have as many as there are PHIs in the group. */
5048 if (i >= initial_values.length () || (j > i && neutral_op))
5049 op = neutral_op;
5050 else
5051 op = initial_values[i];
5053 /* Create 'vect_ = {op0,op1,...,opn}'. */
5054 number_of_places_left_in_vector--;
5055 elts[nunits - number_of_places_left_in_vector - 1] = op;
5056 if (!CONSTANT_CLASS_P (op))
5057 constant_p = false;
5059 if (number_of_places_left_in_vector == 0)
5061 tree init;
5062 if (constant_p && !neutral_op
5063 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5064 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5065 /* Build the vector directly from ELTS. */
5066 init = gimple_build_vector (&ctor_seq, &elts);
5067 else if (neutral_op)
5069 /* Build a vector of the neutral value and shift the
5070 other elements into place. */
5071 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5072 neutral_op);
5073 int k = nunits;
5074 while (k > 0 && elts[k - 1] == neutral_op)
5075 k -= 1;
5076 while (k > 0)
5078 k -= 1;
5079 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5080 vector_type, init, elts[k]);
5083 else
5085 /* First time round, duplicate ELTS to fill the
5086 required number of vectors. */
5087 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5088 elts, number_of_vectors, *vec_oprnds);
5089 break;
5091 vec_oprnds->quick_push (init);
5093 number_of_places_left_in_vector = nunits;
5094 elts.new_vector (vector_type, nunits, 1);
5095 elts.quick_grow (nunits);
5096 constant_p = true;
5099 if (ctor_seq != NULL)
5100 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5103 /* For a statement STMT_INFO taking part in a reduction operation return
5104 the stmt_vec_info the meta information is stored on. */
5106 stmt_vec_info
5107 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5109 stmt_info = vect_orig_stmt (stmt_info);
5110 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5111 if (!is_a <gphi *> (stmt_info->stmt)
5112 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5113 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5114 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5115 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5117 if (gimple_phi_num_args (phi) == 1)
5118 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5120 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5122 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5123 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5124 stmt_info = info;
5126 return stmt_info;
5129 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5130 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5131 return false. */
5133 static bool
5134 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5135 stmt_vec_info reduc_info)
5137 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5138 if (!main_loop_vinfo)
5139 return false;
5141 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5142 return false;
5144 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5145 auto_vec<tree, 16> main_loop_results (num_phis);
5146 auto_vec<tree, 16> initial_values (num_phis);
5147 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5149 /* The epilogue loop can be entered either from the main loop or
5150 from an earlier guard block. */
5151 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5152 for (tree incoming_value : reduc_info->reduc_initial_values)
5154 /* Look for:
5156 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5157 INITIAL_VALUE(guard block)>. */
5158 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5160 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5161 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5163 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5164 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5166 main_loop_results.quick_push (from_main_loop);
5167 initial_values.quick_push (from_skip);
5170 else
5171 /* The main loop dominates the epilogue loop. */
5172 main_loop_results.splice (reduc_info->reduc_initial_values);
5174 /* See if the main loop has the kind of accumulator we need. */
5175 vect_reusable_accumulator *accumulator
5176 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5177 if (!accumulator
5178 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5179 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5180 accumulator->reduc_info->reduc_scalar_results.begin ()))
5181 return false;
5183 /* Handle the case where we can reduce wider vectors to narrower ones. */
5184 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5185 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5186 unsigned HOST_WIDE_INT m;
5187 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5188 TYPE_VECTOR_SUBPARTS (vectype), &m))
5189 return false;
5190 /* Check the intermediate vector types and operations are available. */
5191 tree prev_vectype = old_vectype;
5192 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5193 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5195 intermediate_nunits = exact_div (intermediate_nunits, 2);
5196 tree intermediate_vectype = get_related_vectype_for_scalar_type
5197 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5198 if (!intermediate_vectype
5199 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5200 intermediate_vectype)
5201 || !can_vec_extract (TYPE_MODE (prev_vectype),
5202 TYPE_MODE (intermediate_vectype)))
5203 return false;
5204 prev_vectype = intermediate_vectype;
5207 /* Non-SLP reductions might apply an adjustment after the reduction
5208 operation, in order to simplify the initialization of the accumulator.
5209 If the epilogue loop carries on from where the main loop left off,
5210 it should apply the same adjustment to the final reduction result.
5212 If the epilogue loop can also be entered directly (rather than via
5213 the main loop), we need to be able to handle that case in the same way,
5214 with the same adjustment. (In principle we could add a PHI node
5215 to select the correct adjustment, but in practice that shouldn't be
5216 necessary.) */
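/* Illustrative example (an editorial sketch, not from the original
   comment): for a reduction like res = 10 + sum of a[i], the main loop
   typically starts its vector accumulator at the neutral value {0, ...}
   and records 10 as the epilogue adjustment added to the final scalar.
   The checks below make sure the epilogue loop can keep using that same
   scheme: start from the main loop's accumulator and apply the same
   adjustment once at the end.  */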
5217 tree main_adjustment
5218 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5219 if (loop_vinfo->main_loop_edge && main_adjustment)
5221 gcc_assert (num_phis == 1);
5222 tree initial_value = initial_values[0];
5223 /* Check that we can use INITIAL_VALUE as the adjustment and
5224 initialize the accumulator with a neutral value instead. */
5225 if (!operand_equal_p (initial_value, main_adjustment))
5226 return false;
5227 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5228 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5229 code, initial_value);
5231 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5232 reduc_info->reduc_initial_values.truncate (0);
5233 reduc_info->reduc_initial_values.splice (initial_values);
5234 reduc_info->reused_accumulator = accumulator;
5235 return true;
5238 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5239 CODE, emitting the stmts to SEQ. Returns a vector def of VECTYPE. */
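/* Editorial sketch of the halving step (not part of the original
   comment): with CODE = PLUS_EXPR, reducing a V8SI VEC_DEF to V4SI
   extracts the low and high V4SI halves and adds them; the loop below
   repeats this halving until the requested VECTYPE width is reached.  */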
5241 static tree
5242 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5243 gimple_seq *seq)
5245 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5246 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5247 tree stype = TREE_TYPE (vectype);
5248 tree new_temp = vec_def;
5249 while (nunits > nunits1)
5251 nunits /= 2;
5252 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5253 stype, nunits);
5254 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5256 /* The target has to make sure we support lowpart/highpart
5257 extraction, either via direct vector extract or through
5258 an integer mode punning. */
5259 tree dst1, dst2;
5260 gimple *epilog_stmt;
5261 if (convert_optab_handler (vec_extract_optab,
5262 TYPE_MODE (TREE_TYPE (new_temp)),
5263 TYPE_MODE (vectype1))
5264 != CODE_FOR_nothing)
5266 /* Extract sub-vectors directly once vec_extract becomes
5267 a conversion optab. */
5268 dst1 = make_ssa_name (vectype1);
5269 epilog_stmt
5270 = gimple_build_assign (dst1, BIT_FIELD_REF,
5271 build3 (BIT_FIELD_REF, vectype1,
5272 new_temp, TYPE_SIZE (vectype1),
5273 bitsize_int (0)));
5274 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5275 dst2 = make_ssa_name (vectype1);
5276 epilog_stmt
5277 = gimple_build_assign (dst2, BIT_FIELD_REF,
5278 build3 (BIT_FIELD_REF, vectype1,
5279 new_temp, TYPE_SIZE (vectype1),
5280 bitsize_int (bitsize)));
5281 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5283 else
5285 /* Extract via punning to an appropriately sized integer mode
5286 vector. */
5287 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5288 tree etype = build_vector_type (eltype, 2);
5289 gcc_assert (convert_optab_handler (vec_extract_optab,
5290 TYPE_MODE (etype),
5291 TYPE_MODE (eltype))
5292 != CODE_FOR_nothing);
5293 tree tem = make_ssa_name (etype);
5294 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5295 build1 (VIEW_CONVERT_EXPR,
5296 etype, new_temp));
5297 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5298 new_temp = tem;
5299 tem = make_ssa_name (eltype);
5300 epilog_stmt
5301 = gimple_build_assign (tem, BIT_FIELD_REF,
5302 build3 (BIT_FIELD_REF, eltype,
5303 new_temp, TYPE_SIZE (eltype),
5304 bitsize_int (0)));
5305 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5306 dst1 = make_ssa_name (vectype1);
5307 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5308 build1 (VIEW_CONVERT_EXPR,
5309 vectype1, tem));
5310 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5311 tem = make_ssa_name (eltype);
5312 epilog_stmt
5313 = gimple_build_assign (tem, BIT_FIELD_REF,
5314 build3 (BIT_FIELD_REF, eltype,
5315 new_temp, TYPE_SIZE (eltype),
5316 bitsize_int (bitsize)));
5317 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5318 dst2 = make_ssa_name (vectype1);
5319 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5320 build1 (VIEW_CONVERT_EXPR,
5321 vectype1, tem));
5322 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5325 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5328 return new_temp;
5331 /* Function vect_create_epilog_for_reduction
5333 Create code at the loop-epilog to finalize the result of a reduction
5334 computation.
5336 STMT_INFO is the scalar reduction stmt that is being vectorized.
5337 SLP_NODE is an SLP node containing a group of reduction statements. The
5338 first one in this group is STMT_INFO.
5339 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5340 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5341 (counting from 0)
5343 This function:
5344 1. Completes the reduction def-use cycles.
5345 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5346 by calling the function specified by REDUC_FN if available, or by
5347 other means (whole-vector shifts or a scalar loop).
5348 The function also creates a new phi node at the loop exit to preserve
5349 loop-closed form, as illustrated below.
5351 The flow at the entry to this function:
5353 loop:
5354 vec_def = phi <vec_init, null> # REDUCTION_PHI
5355 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5356 s_loop = scalar_stmt # (scalar) STMT_INFO
5357 loop_exit:
5358 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5359 use <s_out0>
5360 use <s_out0>
5362 The above is transformed by this function into:
5364 loop:
5365 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5366 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5367 s_loop = scalar_stmt # (scalar) STMT_INFO
5368 loop_exit:
5369 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5370 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5371 v_out2 = reduce <v_out1>
5372 s_out3 = extract_field <v_out2, 0>
5373 s_out4 = adjust_result <s_out3>
5374 use <s_out4>
5375 use <s_out4>
5378 static void
5379 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5380 stmt_vec_info stmt_info,
5381 slp_tree slp_node,
5382 slp_instance slp_node_instance)
5384 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5385 gcc_assert (reduc_info->is_reduc_info);
5386 /* For double reductions we need to get at the inner loop reduction
5387 stmt which has the meta info attached. Our stmt_info is that of the
5388 loop-closed PHI of the inner loop which we remember as
5389 def for the reduction PHI generation. */
5390 bool double_reduc = false;
5391 stmt_vec_info rdef_info = stmt_info;
5392 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5394 gcc_assert (!slp_node);
5395 double_reduc = true;
5396 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5397 (stmt_info->stmt, 0));
5398 stmt_info = vect_stmt_to_vectorize (stmt_info);
5400 gphi *reduc_def_stmt
5401 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5402 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5403 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5404 tree vectype;
5405 machine_mode mode;
5406 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5407 basic_block exit_bb;
5408 tree scalar_dest;
5409 tree scalar_type;
5410 gimple *new_phi = NULL, *phi;
5411 gimple_stmt_iterator exit_gsi;
5412 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5413 gimple *epilog_stmt = NULL;
5414 gimple *exit_phi;
5415 tree bitsize;
5416 tree def;
5417 tree orig_name, scalar_result;
5418 imm_use_iterator imm_iter, phi_imm_iter;
5419 use_operand_p use_p, phi_use_p;
5420 gimple *use_stmt;
5421 auto_vec<tree> reduc_inputs;
5422 int j, i;
5423 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5424 unsigned int group_size = 1, k;
5425 auto_vec<gimple *> phis;
5426 /* SLP reduction without reduction chain, e.g.,
5427 # a1 = phi <a2, a0>
5428 # b1 = phi <b2, b0>
5429 a2 = operation (a1)
5430 b2 = operation (b1) */
5431 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5432 bool direct_slp_reduc;
5433 tree induction_index = NULL_TREE;
5435 if (slp_node)
5436 group_size = SLP_TREE_LANES (slp_node);
5438 if (nested_in_vect_loop_p (loop, stmt_info))
5440 outer_loop = loop;
5441 loop = loop->inner;
5442 gcc_assert (!slp_node && double_reduc);
5445 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5446 gcc_assert (vectype);
5447 mode = TYPE_MODE (vectype);
5449 tree induc_val = NULL_TREE;
5450 tree adjustment_def = NULL;
5451 if (slp_node)
5453 else
5455 /* Optimize: for induction condition reduction, if we can't use zero
5456 for induc_val, use initial_def. */
5457 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5458 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5459 else if (double_reduc)
5461 else
5462 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5465 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5466 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5467 if (slp_reduc)
5468 /* All statements produce live-out values. */
5469 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5470 else if (slp_node)
5472 /* The last statement in the reduction chain produces the live-out
5473 value. Note that SLP optimization can shuffle scalar stmts to
5474 optimize permutations, so we have to search for the last stmt. */
5475 for (k = 0; k < group_size; ++k)
5476 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5478 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5479 break;
5483 unsigned vec_num;
5484 int ncopies;
5485 if (slp_node)
5487 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5488 ncopies = 1;
5490 else
5492 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5493 vec_num = 1;
5494 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5497 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5498 which is updated with the current index of the loop for every match of
5499 the original loop's cond_expr (VEC_STMT). This results in a vector
5500 containing the last time the condition passed for that vector lane.
5501 The first match will be a 1 to allow 0 to be used for non-matching
5502 indexes. If there are no matches at all then the vector will be all
5503 zeroes.
5505 PR92772: This algorithm is broken for architectures that support
5506 masked vectors, but do not provide fold_extract_last. */
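/* Editorial worked example, assuming a vectorization factor of 4: if the
   condition matches in scalar iterations 1 and 6 (0-based), the index
   vector evolves as
     {0,0,0,0} -> {0,2,0,0}   after the first vector iteration
     {0,2,0,0} -> {0,2,7,0}   after the second vector iteration
   so a MAX reduction over it yields 7, the 1-based index of the last
   match, which is later used to pick the matching data lane.  */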
5507 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5509 auto_vec<std::pair<tree, bool>, 2> ccompares;
5510 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5511 cond_info = vect_stmt_to_vectorize (cond_info);
5512 while (cond_info != reduc_info)
5514 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5516 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5517 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5518 ccompares.safe_push
5519 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5520 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5522 cond_info
5523 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5524 1 + STMT_VINFO_REDUC_IDX
5525 (cond_info)));
5526 cond_info = vect_stmt_to_vectorize (cond_info);
5528 gcc_assert (ccompares.length () != 0);
5530 tree indx_before_incr, indx_after_incr;
5531 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5532 int scalar_precision
5533 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5534 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5535 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5536 (TYPE_MODE (vectype), cr_index_scalar_type,
5537 TYPE_VECTOR_SUBPARTS (vectype));
5539 /* First we create a simple vector induction variable which starts
5540 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5541 vector size (STEP). */
5543 /* Create a {1,2,3,...} vector. */
5544 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5546 /* Create a vector of the step value. */
5547 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5548 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5550 /* Create an induction variable. */
5551 gimple_stmt_iterator incr_gsi;
5552 bool insert_after;
5553 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5554 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5555 insert_after, &indx_before_incr, &indx_after_incr);
5557 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5558 filled with zeros (VEC_ZERO). */
5560 /* Create a vector of 0s. */
5561 tree zero = build_zero_cst (cr_index_scalar_type);
5562 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5564 /* Create a vector phi node. */
5565 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5566 new_phi = create_phi_node (new_phi_tree, loop->header);
5567 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5568 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5570 /* Now take the condition from the loop's original cond_exprs
5571 and produce a new cond_expr (INDEX_COND_EXPR) which for
5572 every match uses values from the induction variable
5573 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
5574 (NEW_PHI_TREE).
5575 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5576 the new cond_expr (INDEX_COND_EXPR). */
5577 gimple_seq stmts = NULL;
5578 for (int i = ccompares.length () - 1; i != -1; --i)
5580 tree ccompare = ccompares[i].first;
5581 if (ccompares[i].second)
5582 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5583 cr_index_vector_type,
5584 ccompare,
5585 indx_before_incr, new_phi_tree);
5586 else
5587 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5588 cr_index_vector_type,
5589 ccompare,
5590 new_phi_tree, indx_before_incr);
5592 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5594 /* Update the phi with the vec cond. */
5595 induction_index = new_phi_tree;
5596 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5597 loop_latch_edge (loop), UNKNOWN_LOCATION);
5600 /* 2. Create epilog code.
5601 The reduction epilog code operates across the elements of the vector
5602 of partial results computed by the vectorized loop.
5603 The reduction epilog code consists of:
5605 step 1: compute the scalar result in a vector (v_out2)
5606 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5607 step 3: adjust the scalar result (s_out3) if needed.
5609 Step 1 can be accomplished using one of the following three schemes:
5610 (scheme 1) using reduc_fn, if available.
5611 (scheme 2) using whole-vector shifts, if available.
5612 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5613 combined.
5615 The overall epilog code looks like this:
5617 s_out0 = phi <s_loop> # original EXIT_PHI
5618 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5619 v_out2 = reduce <v_out1> # step 1
5620 s_out3 = extract_field <v_out2, 0> # step 2
5621 s_out4 = adjust_result <s_out3> # step 3
5623 (step 3 is optional, and steps 1 and 2 may be combined).
5624 Lastly, the uses of s_out0 are replaced by s_out4. */
5627 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5628 v_out1 = phi <VECT_DEF>
5629 Store them in NEW_PHIS. */
5630 if (double_reduc)
5631 loop = outer_loop;
5632 exit_bb = single_exit (loop)->dest;
5633 exit_gsi = gsi_after_labels (exit_bb);
5634 reduc_inputs.create (slp_node ? vec_num : ncopies);
5635 for (unsigned i = 0; i < vec_num; i++)
5637 gimple_seq stmts = NULL;
5638 if (slp_node)
5639 def = vect_get_slp_vect_def (slp_node, i);
5640 else
5641 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5642 for (j = 0; j < ncopies; j++)
5644 tree new_def = copy_ssa_name (def);
5645 phi = create_phi_node (new_def, exit_bb);
5646 if (j)
5647 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5648 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5649 new_def = gimple_convert (&stmts, vectype, new_def);
5650 reduc_inputs.quick_push (new_def);
5652 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5655 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5656 (i.e. when reduc_fn is not available) and in the final adjustment
5657 code (if needed). Also get the original scalar reduction variable as
5658 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5659 represents a reduction pattern), the tree-code and scalar-def are
5660 taken from the original stmt that the pattern-stmt (STMT) replaces.
5661 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5662 are taken from STMT. */
5664 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5665 if (orig_stmt_info != stmt_info)
5667 /* Reduction pattern */
5668 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5669 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5672 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5673 scalar_type = TREE_TYPE (scalar_dest);
5674 scalar_results.truncate (0);
5675 scalar_results.reserve_exact (group_size);
5676 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5677 bitsize = TYPE_SIZE (scalar_type);
5679 /* True if we should implement SLP_REDUC using native reduction operations
5680 instead of scalar operations. */
5681 direct_slp_reduc = (reduc_fn != IFN_LAST
5682 && slp_reduc
5683 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5685 /* In case of a reduction chain, e.g.,
5686 # a1 = phi <a3, a0>
5687 a2 = operation (a1)
5688 a3 = operation (a2),
5690 we may end up with more than one vector result. Here we reduce them
5691 to one vector.
5693 The same is true for a SLP reduction, e.g.,
5694 # a1 = phi <a2, a0>
5695 # b1 = phi <b2, b0>
5696 a2 = operation (a1)
5697 b2 = operation (b1),
5699 where we can end up with more than one vector as well. We can
5700 easily accumulate vectors when the number of vector elements is
5701 a multiple of the SLP group size.
5703 The same is true if we couldn't use a single def-use cycle. */
5704 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5705 || direct_slp_reduc
5706 || (slp_reduc
5707 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
5708 || ncopies > 1)
5710 gimple_seq stmts = NULL;
5711 tree single_input = reduc_inputs[0];
5712 for (k = 1; k < reduc_inputs.length (); k++)
5713 single_input = gimple_build (&stmts, code, vectype,
5714 single_input, reduc_inputs[k]);
5715 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5717 reduc_inputs.truncate (0);
5718 reduc_inputs.safe_push (single_input);
5721 tree orig_reduc_input = reduc_inputs[0];
5723 /* If this loop is an epilogue loop that can be skipped after the
5724 main loop, we can only share a reduction operation between the
5725 main loop and the epilogue if we put it at the target of the
5726 skip edge.
5728 We can still reuse accumulators if this check fails. Doing so has
5729 the minor(?) benefit of making the epilogue loop's scalar result
5730 independent of the main loop's scalar result. */
5731 bool unify_with_main_loop_p = false;
5732 if (reduc_info->reused_accumulator
5733 && loop_vinfo->skip_this_loop_edge
5734 && single_succ_p (exit_bb)
5735 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5737 unify_with_main_loop_p = true;
5739 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5740 reduc_inputs[0] = make_ssa_name (vectype);
5741 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5742 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5743 UNKNOWN_LOCATION);
5744 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5745 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5746 exit_gsi = gsi_after_labels (reduc_block);
5749 /* Shouldn't be used beyond this point. */
5750 exit_bb = nullptr;
5752 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5753 && reduc_fn != IFN_LAST)
5755 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5756 various data values where the condition matched and another vector
5757 (INDUCTION_INDEX) containing all the indexes of those matches. We
5758 need to extract the last matching index (which will be the index with
5759 highest value) and use this to index into the data vector.
5760 For the case where there were no matches, the data vector will contain
5761 all default values and the index vector will be all zeros. */
5763 /* Get various versions of the type of the vector of indexes. */
5764 tree index_vec_type = TREE_TYPE (induction_index);
5765 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5766 tree index_scalar_type = TREE_TYPE (index_vec_type);
5767 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5769 /* Get an unsigned integer version of the type of the data vector. */
5770 int scalar_precision
5771 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5772 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5773 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5774 vectype);
5776 /* First we need to create a vector (ZERO_VEC) of zeros and another
5777 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5778 can create using a MAX reduction and then expanding.
5779 In the case where the loop never made any matches, the max index will
5780 be zero. */
5782 /* Vector of {0, 0, 0,...}. */
5783 tree zero_vec = build_zero_cst (vectype);
5785 /* Find maximum value from the vector of found indexes. */
5786 tree max_index = make_ssa_name (index_scalar_type);
5787 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5788 1, induction_index);
5789 gimple_call_set_lhs (max_index_stmt, max_index);
5790 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5792 /* Vector of {max_index, max_index, max_index,...}. */
5793 tree max_index_vec = make_ssa_name (index_vec_type);
5794 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5795 max_index);
5796 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5797 max_index_vec_rhs);
5798 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5800 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5801 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5802 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5803 otherwise. Only one value should match, resulting in a vector
5804 (VEC_COND) with one data value and the rest zeros.
5805 In the case where the loop never made any matches, every index will
5806 match, resulting in a vector with all data values (which will all be
5807 the default value). */
5809 /* Compare the max index vector to the vector of found indexes to find
5810 the position of the max value. */
5811 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5812 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5813 induction_index,
5814 max_index_vec);
5815 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5817 /* Use the compare to choose either values from the data vector or
5818 zero. */
5819 tree vec_cond = make_ssa_name (vectype);
5820 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5821 vec_compare,
5822 reduc_inputs[0],
5823 zero_vec);
5824 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5826 /* Finally we need to extract the data value from the vector (VEC_COND)
5827 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5828 reduction, but because this doesn't exist, we can use a MAX reduction
5829 instead. The data value might be signed or a float, so we need to cast
5830 it to an unsigned integer type first.
5831 In the case where the loop never made any matches, the data values are
5832 all identical, and so will reduce down correctly. */
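/* Editorial sketch (illustrative values only): if VEC_COND is
   {0, 0, d, 0}, with d the data value of the single matching lane, then
   viewing it as an unsigned vector and taking a MAX reduction recovers
   the bit pattern of d, since all other lanes are zero; converting back
   to SCALAR_TYPE then yields d itself.  */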
5834 /* Make the matched data values unsigned. */
5835 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5836 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5837 vec_cond);
5838 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5839 VIEW_CONVERT_EXPR,
5840 vec_cond_cast_rhs);
5841 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5843 /* Reduce down to a scalar value. */
5844 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5845 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5846 1, vec_cond_cast);
5847 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5848 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5850 /* Convert the reduced value back to the result type and set as the
5851 result. */
5852 gimple_seq stmts = NULL;
5853 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5854 data_reduc);
5855 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5856 scalar_results.safe_push (new_temp);
5858 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5859 && reduc_fn == IFN_LAST)
5861 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5863 idx_val = induction_index[0];
5864 val = data_reduc[0];
5865 for (i = 1; i < nelts; ++i)
5866 if (induction_index[i] > idx_val)
5867 val = data_reduc[i], idx_val = induction_index[i];
5868 return val; */
5870 tree data_eltype = TREE_TYPE (vectype);
5871 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5872 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5873 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5874 /* Enforced by vectorizable_reduction, which ensures we have target
5875 support before allowing a conditional reduction on variable-length
5876 vectors. */
5877 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5878 tree idx_val = NULL_TREE, val = NULL_TREE;
5879 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5881 tree old_idx_val = idx_val;
5882 tree old_val = val;
5883 idx_val = make_ssa_name (idx_eltype);
5884 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5885 build3 (BIT_FIELD_REF, idx_eltype,
5886 induction_index,
5887 bitsize_int (el_size),
5888 bitsize_int (off)));
5889 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5890 val = make_ssa_name (data_eltype);
5891 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5892 build3 (BIT_FIELD_REF,
5893 data_eltype,
5894 reduc_inputs[0],
5895 bitsize_int (el_size),
5896 bitsize_int (off)));
5897 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5898 if (off != 0)
5900 tree new_idx_val = idx_val;
5901 if (off != v_size - el_size)
5903 new_idx_val = make_ssa_name (idx_eltype);
5904 epilog_stmt = gimple_build_assign (new_idx_val,
5905 MAX_EXPR, idx_val,
5906 old_idx_val);
5907 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5909 tree cond = make_ssa_name (boolean_type_node);
5910 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5911 idx_val, old_idx_val);
5912 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5913 tree new_val = make_ssa_name (data_eltype);
5914 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5915 cond, val, old_val);
5916 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5917 idx_val = new_idx_val;
5918 val = new_val;
5921 /* Convert the reduced value back to the result type and set as the
5922 result. */
5923 gimple_seq stmts = NULL;
5924 val = gimple_convert (&stmts, scalar_type, val);
5925 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5926 scalar_results.safe_push (val);
5929 /* 2.3 Create the reduction code, using one of the three schemes described
5930 above. In SLP we simply need to extract all the elements from the
5931 vector (without reducing them), so we use scalar shifts. */
5932 else if (reduc_fn != IFN_LAST && !slp_reduc)
5934 tree tmp;
5935 tree vec_elem_type;
5937 /* Case 1: Create:
5938 v_out2 = reduc_expr <v_out1> */
5940 if (dump_enabled_p ())
5941 dump_printf_loc (MSG_NOTE, vect_location,
5942 "Reduce using direct vector reduction.\n");
5944 gimple_seq stmts = NULL;
5945 vec_elem_type = TREE_TYPE (vectype);
5946 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5947 vec_elem_type, reduc_inputs[0]);
5948 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5949 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5951 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5952 && induc_val)
5954 /* Earlier we set the initial value to be a vector of induc_val
5955 values. Check the result, and if it is induc_val then replace it
5956 with the original initial value, unless induc_val is
5957 already the same as initial_def. */
5958 tree zcompare = make_ssa_name (boolean_type_node);
5959 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5960 new_temp, induc_val);
5961 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5962 tree initial_def = reduc_info->reduc_initial_values[0];
5963 tmp = make_ssa_name (new_scalar_dest);
5964 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5965 initial_def, new_temp);
5966 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5967 new_temp = tmp;
5970 scalar_results.safe_push (new_temp);
5972 else if (direct_slp_reduc)
5974 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5975 with the elements for other SLP statements replaced with the
5976 neutral value. We can then do a normal reduction on each vector. */
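/* Editorial example, assuming REDUC_GROUP_SIZE == 2 and a single input
   vector {a0, b0, a1, b1}: the loop below forms the two vectors
     {a0, id, a1, id}   and   {id, b0, id, b1}
   (where id is the neutral value) and applies REDUC_FN to each of them
   to obtain the two scalar results.  */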
5978 /* Enforced by vectorizable_reduction. */
5979 gcc_assert (reduc_inputs.length () == 1);
5980 gcc_assert (pow2p_hwi (group_size));
5982 gimple_seq seq = NULL;
5984 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5985 and the same element size as VECTYPE. */
5986 tree index = build_index_vector (vectype, 0, 1);
5987 tree index_type = TREE_TYPE (index);
5988 tree index_elt_type = TREE_TYPE (index_type);
5989 tree mask_type = truth_type_for (index_type);
5991 /* Create a vector that, for each element, identifies which of
5992 the REDUC_GROUP_SIZE results should use it. */
5993 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5994 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5995 build_vector_from_val (index_type, index_mask));
5997 /* Get a neutral vector value. This is simply a splat of the neutral
5998 scalar value if we have one, otherwise the initial scalar value
5999 is itself a neutral value. */
6000 tree vector_identity = NULL_TREE;
6001 tree neutral_op = NULL_TREE;
6002 if (slp_node)
6004 tree initial_value = NULL_TREE;
6005 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6006 initial_value = reduc_info->reduc_initial_values[0];
6007 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6008 initial_value);
6010 if (neutral_op)
6011 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6012 neutral_op);
6013 for (unsigned int i = 0; i < group_size; ++i)
6015 /* If there's no universal neutral value, we can use the
6016 initial scalar value from the original PHI. This is used
6017 for MIN and MAX reductions, for example. */
6018 if (!neutral_op)
6020 tree scalar_value = reduc_info->reduc_initial_values[i];
6021 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6022 scalar_value);
6023 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6024 scalar_value);
6027 /* Calculate the equivalent of:
6029 sel[j] = (index[j] == i);
6031 which selects the elements of REDUC_INPUTS[0] that should
6032 be included in the result. */
6033 tree compare_val = build_int_cst (index_elt_type, i);
6034 compare_val = build_vector_from_val (index_type, compare_val);
6035 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6036 index, compare_val);
6038 /* Calculate the equivalent of:
6040 vec = sel ? reduc_inputs[0] : vector_identity;
6042 VEC is now suitable for a full vector reduction. */
6043 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6044 sel, reduc_inputs[0], vector_identity);
6046 /* Do the reduction and convert it to the appropriate type. */
6047 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6048 TREE_TYPE (vectype), vec);
6049 scalar = gimple_convert (&seq, scalar_type, scalar);
6050 scalar_results.safe_push (scalar);
6052 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6054 else
6056 bool reduce_with_shift;
6057 tree vec_temp;
6059 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6061 /* See if the target wants to do the final (shift) reduction
6062 in a vector mode of smaller size and first reduce upper/lower
6063 halves against each other. */
6064 enum machine_mode mode1 = mode;
6065 tree stype = TREE_TYPE (vectype);
6066 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6067 unsigned nunits1 = nunits;
6068 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6069 && reduc_inputs.length () == 1)
6071 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6072 /* For SLP reductions we have to make sure lanes match up, but
6073 since we're doing an individual-element final reduction, reducing
6074 the vector width here is even more important.
6075 ??? We can also separate lanes with permutes; for the common
6076 case of a power-of-two group size, odd/even extracts would work. */
6077 if (slp_reduc && nunits != nunits1)
6079 nunits1 = least_common_multiple (nunits1, group_size);
6080 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6083 if (!slp_reduc
6084 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6085 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6087 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6088 stype, nunits1);
6089 reduce_with_shift = have_whole_vector_shift (mode1);
6090 if (!VECTOR_MODE_P (mode1)
6091 || !directly_supported_p (code, vectype1))
6092 reduce_with_shift = false;
6094 /* First reduce the vector to the desired vector size we should
6095 do shift reduction on by combining upper and lower halves. */
6096 gimple_seq stmts = NULL;
6097 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6098 code, &stmts);
6099 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6100 reduc_inputs[0] = new_temp;
6102 if (reduce_with_shift && !slp_reduc)
6104 int element_bitsize = tree_to_uhwi (bitsize);
6105 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6106 for variable-length vectors and also requires direct target support
6107 for loop reductions. */
6108 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6109 int nelements = vec_size_in_bits / element_bitsize;
6110 vec_perm_builder sel;
6111 vec_perm_indices indices;
6113 int elt_offset;
6115 tree zero_vec = build_zero_cst (vectype1);
6116 /* Case 2: Create:
6117 for (offset = nelements/2; offset >= 1; offset/=2)
6119 Create: va' = vec_shift <va, offset>
6120 Create: va = vop <va, va'>
6121 } */
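/* Editorial trace for a 4-element vector {a,b,c,d} and CODE = PLUS_EXPR
   (lanes marked _ are don't-care):
     offset 2:  va' = {c, d, _, _}      va = {a+c, b+d, _, _}
     offset 1:  va' = {b+d, _, _, _}    va = {a+b+c+d, _, _, _}
   so the final scalar result ends up in element 0, which is what the
   BIT_FIELD_REF extraction below reads.  */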
6123 tree rhs;
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_NOTE, vect_location,
6127 "Reduce using vector shifts\n");
6129 gimple_seq stmts = NULL;
6130 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6131 for (elt_offset = nelements / 2;
6132 elt_offset >= 1;
6133 elt_offset /= 2)
6135 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6136 indices.new_vector (sel, 2, nelements);
6137 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6138 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6139 new_temp, zero_vec, mask);
6140 new_temp = gimple_build (&stmts, code,
6141 vectype1, new_name, new_temp);
6143 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6145 /* 2.4 Extract the final scalar result. Create:
6146 s_out3 = extract_field <v_out2, bitpos> */
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_NOTE, vect_location,
6150 "extract scalar result\n");
6152 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6153 bitsize, bitsize_zero_node);
6154 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6155 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6156 gimple_assign_set_lhs (epilog_stmt, new_temp);
6157 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6158 scalar_results.safe_push (new_temp);
6160 else
6162 /* Case 3: Create:
6163 s = extract_field <v_out2, 0>
6164 for (offset = element_size;
6165 offset < vector_size;
6166 offset += element_size;)
6168 Create: s' = extract_field <v_out2, offset>
6169 Create: s = op <s, s'> // For non SLP cases
6170 } */
6172 if (dump_enabled_p ())
6173 dump_printf_loc (MSG_NOTE, vect_location,
6174 "Reduce using scalar code.\n");
6176 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6177 int element_bitsize = tree_to_uhwi (bitsize);
6178 tree compute_type = TREE_TYPE (vectype);
6179 gimple_seq stmts = NULL;
6180 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6182 int bit_offset;
6183 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6184 vec_temp, bitsize, bitsize_zero_node);
6186 /* In SLP we don't need to apply the reduction operation, so we just
6187 collect the s' values in SCALAR_RESULTS. */
6188 if (slp_reduc)
6189 scalar_results.safe_push (new_temp);
6191 for (bit_offset = element_bitsize;
6192 bit_offset < vec_size_in_bits;
6193 bit_offset += element_bitsize)
6195 tree bitpos = bitsize_int (bit_offset);
6196 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6197 compute_type, vec_temp,
6198 bitsize, bitpos);
6199 if (slp_reduc)
6201 /* In SLP we don't need to apply the reduction operation, so
6202 we just collect the s' values in SCALAR_RESULTS. */
6203 new_temp = new_name;
6204 scalar_results.safe_push (new_name);
6206 else
6207 new_temp = gimple_build (&stmts, code, compute_type,
6208 new_name, new_temp);
6212 /* The only case where we need to reduce scalar results in SLP is
6213 unrolling. If the size of SCALAR_RESULTS is greater than
6214 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6215 REDUC_GROUP_SIZE. */
6216 if (slp_reduc)
6218 tree res, first_res, new_res;
6220 /* Reduce multiple scalar results in case of SLP unrolling. */
6221 for (j = group_size; scalar_results.iterate (j, &res);
6222 j++)
6224 first_res = scalar_results[j % group_size];
6225 new_res = gimple_build (&stmts, code, compute_type,
6226 first_res, res);
6227 scalar_results[j % group_size] = new_res;
6229 scalar_results.truncate (group_size);
6230 for (k = 0; k < group_size; k++)
6231 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6232 scalar_results[k]);
6234 else
6236 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6237 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6238 scalar_results.safe_push (new_temp);
6241 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6244 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6245 && induc_val)
6247 /* Earlier we set the initial value to be a vector of induc_val
6248 values. Check the result, and if it is induc_val then replace it
6249 with the original initial value, unless induc_val is
6250 already the same as initial_def. */
6251 tree zcompare = make_ssa_name (boolean_type_node);
6252 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6253 induc_val);
6254 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6255 tree initial_def = reduc_info->reduc_initial_values[0];
6256 tree tmp = make_ssa_name (new_scalar_dest);
6257 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6258 initial_def, new_temp);
6259 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6260 scalar_results[0] = tmp;
6264 /* 2.5 Adjust the final result by the initial value of the reduction
6265 variable. (When such adjustment is not needed, then
6266 'adjustment_def' is zero). For example, if code is PLUS we create:
6267 new_temp = loop_exit_def + adjustment_def */
6269 if (adjustment_def)
6271 gcc_assert (!slp_reduc);
6272 gimple_seq stmts = NULL;
6273 if (double_reduc)
6275 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6276 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6277 new_temp = gimple_build (&stmts, code, vectype,
6278 reduc_inputs[0], adjustment_def);
6280 else
6282 new_temp = scalar_results[0];
6283 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6284 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6285 new_temp = gimple_build (&stmts, code, scalar_type,
6286 new_temp, adjustment_def);
6289 epilog_stmt = gimple_seq_last_stmt (stmts);
6290 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6291 scalar_results[0] = new_temp;
6294 /* Record this operation if it could be reused by the epilogue loop. */
6295 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6296 && reduc_inputs.length () == 1)
6297 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6298 { orig_reduc_input, reduc_info });
6300 if (double_reduc)
6301 loop = outer_loop;
6303 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6304 phis with new adjusted scalar results, i.e., replace use <s_out0>
6305 with use <s_out4>.
6307 Transform:
6308 loop_exit:
6309 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6310 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6311 v_out2 = reduce <v_out1>
6312 s_out3 = extract_field <v_out2, 0>
6313 s_out4 = adjust_result <s_out3>
6314 use <s_out0>
6315 use <s_out0>
6317 into:
6319 loop_exit:
6320 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6321 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6322 v_out2 = reduce <v_out1>
6323 s_out3 = extract_field <v_out2, 0>
6324 s_out4 = adjust_result <s_out3>
6325 use <s_out4>
6326 use <s_out4> */
6328 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6329 for (k = 0; k < live_out_stmts.size (); k++)
6331 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6332 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6334 phis.create (3);
6335 /* Find the loop-closed-use at the loop exit of the original scalar
6336 result. (The reduction result is expected to have two immediate uses,
6337 one at the latch block, and one at the loop exit). For double
6338 reductions we are looking for exit phis of the outer loop. */
6339 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6341 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6343 if (!is_gimple_debug (USE_STMT (use_p)))
6344 phis.safe_push (USE_STMT (use_p));
6346 else
6348 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6350 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6352 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6354 if (!flow_bb_inside_loop_p (loop,
6355 gimple_bb (USE_STMT (phi_use_p)))
6356 && !is_gimple_debug (USE_STMT (phi_use_p)))
6357 phis.safe_push (USE_STMT (phi_use_p));
6363 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6365 /* Replace the uses: */
6366 orig_name = PHI_RESULT (exit_phi);
6368 /* Look for a single use at the target of the skip edge. */
6369 if (unify_with_main_loop_p)
6371 use_operand_p use_p;
6372 gimple *user;
6373 if (!single_imm_use (orig_name, &use_p, &user))
6374 gcc_unreachable ();
6375 orig_name = gimple_get_lhs (user);
6378 scalar_result = scalar_results[k];
6379 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6381 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6382 SET_USE (use_p, scalar_result);
6383 update_stmt (use_stmt);
6387 phis.release ();
6391 /* Return a vector of type VECTYPE that is equal to the vector select
6392 operation "MASK ? VEC : IDENTITY". Insert the select statements
6393 before GSI. */
6395 static tree
6396 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6397 tree vec, tree identity)
6399 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6400 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6401 mask, vec, identity);
6402 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6403 return cond;
6406 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6407 order, starting with LHS. Insert the extraction statements before GSI and
6408 associate the new scalar SSA names with variable SCALAR_DEST.
6409 Return the SSA name for the result. */
6411 static tree
6412 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6413 tree_code code, tree lhs, tree vector_rhs)
6415 tree vectype = TREE_TYPE (vector_rhs);
6416 tree scalar_type = TREE_TYPE (vectype);
6417 tree bitsize = TYPE_SIZE (scalar_type);
6418 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6419 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6421 for (unsigned HOST_WIDE_INT bit_offset = 0;
6422 bit_offset < vec_size_in_bits;
6423 bit_offset += element_bitsize)
6425 tree bitpos = bitsize_int (bit_offset);
6426 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6427 bitsize, bitpos);
6429 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6430 rhs = make_ssa_name (scalar_dest, stmt);
6431 gimple_assign_set_lhs (stmt, rhs);
6432 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6434 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6435 tree new_name = make_ssa_name (scalar_dest, stmt);
6436 gimple_assign_set_lhs (stmt, new_name);
6437 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6438 lhs = new_name;
6440 return lhs;
6443 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6444 type of the vector input. */
6446 static internal_fn
6447 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6449 internal_fn mask_reduc_fn;
6451 switch (reduc_fn)
6453 case IFN_FOLD_LEFT_PLUS:
6454 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6455 break;
6457 default:
6458 return IFN_LAST;
6461 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6462 OPTIMIZE_FOR_SPEED))
6463 return mask_reduc_fn;
6464 return IFN_LAST;
6467 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6468 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6469 statement. CODE is the operation performed by STMT_INFO and OPS are
6470 its scalar operands. REDUC_INDEX is the index of the operand in
6471 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6472 implements in-order reduction, or IFN_LAST if we should open-code it.
6473 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6474 that should be used to control the operation in a fully-masked loop. */
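/* Editorial sketch (assumptions, not from the original comment): for an
   in-order PLUS reduction of a 4-element vector v into accumulator acc,
   the generated sequence is equivalent to the scalar code

     acc = (((acc + v[0]) + v[1]) + v[2]) + v[3];

   i.e. the strict left-to-right association of the original scalar loop
   is preserved, which matters for non-reassociable FP reductions.  */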
6476 static bool
6477 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6478 stmt_vec_info stmt_info,
6479 gimple_stmt_iterator *gsi,
6480 gimple **vec_stmt, slp_tree slp_node,
6481 gimple *reduc_def_stmt,
6482 tree_code code, internal_fn reduc_fn,
6483 tree ops[3], tree vectype_in,
6484 int reduc_index, vec_loop_masks *masks)
6486 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6487 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6488 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6490 int ncopies;
6491 if (slp_node)
6492 ncopies = 1;
6493 else
6494 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6496 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6497 gcc_assert (ncopies == 1);
6498 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6500 if (slp_node)
6501 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6502 TYPE_VECTOR_SUBPARTS (vectype_in)));
6504 tree op0 = ops[1 - reduc_index];
6506 int group_size = 1;
6507 stmt_vec_info scalar_dest_def_info;
6508 auto_vec<tree> vec_oprnds0;
6509 if (slp_node)
6511 auto_vec<vec<tree> > vec_defs (2);
6512 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6513 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6514 vec_defs[0].release ();
6515 vec_defs[1].release ();
6516 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6517 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6519 else
6521 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6522 op0, &vec_oprnds0);
6523 scalar_dest_def_info = stmt_info;
6526 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6527 tree scalar_type = TREE_TYPE (scalar_dest);
6528 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6530 int vec_num = vec_oprnds0.length ();
6531 gcc_assert (vec_num == 1 || slp_node);
6532 tree vec_elem_type = TREE_TYPE (vectype_out);
6533 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6535 tree vector_identity = NULL_TREE;
6536 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6537 vector_identity = build_zero_cst (vectype_out);
6539 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6540 int i;
6541 tree def0;
6542 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6544 gimple *new_stmt;
6545 tree mask = NULL_TREE;
6546 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6547 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6549 /* Handle MINUS by adding the negative. */
6550 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6552 tree negated = make_ssa_name (vectype_out);
6553 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6554 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6555 def0 = negated;
6558 if (mask && mask_reduc_fn == IFN_LAST)
6559 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6560 vector_identity);
6562 /* On the first iteration the input is simply the scalar phi
6563 result, and for subsequent iterations it is the output of
6564 the preceding operation. */
6565 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6567 if (mask && mask_reduc_fn != IFN_LAST)
6568 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6569 def0, mask);
6570 else
6571 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6572 def0);
6573 /* For chained SLP reductions the output of the previous reduction
6574 operation serves as the input of the next. For the final statement
6575 the output cannot be a temporary - we reuse the original
6576 scalar destination of the last statement. */
6577 if (i != vec_num - 1)
6579 gimple_set_lhs (new_stmt, scalar_dest_var);
6580 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6581 gimple_set_lhs (new_stmt, reduc_var);
6584 else
6586 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6587 reduc_var, def0);
6588 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6589 /* Remove the statement, so that we can use the same code paths
6590 as for statements that we've just created. */
6591 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6592 gsi_remove (&tmp_gsi, true);
6595 if (i == vec_num - 1)
6597 gimple_set_lhs (new_stmt, scalar_dest);
6598 vect_finish_replace_stmt (loop_vinfo,
6599 scalar_dest_def_info,
6600 new_stmt);
6602 else
6603 vect_finish_stmt_generation (loop_vinfo,
6604 scalar_dest_def_info,
6605 new_stmt, gsi);
6607 if (slp_node)
6608 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6609 else
6611 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6612 *vec_stmt = new_stmt;
6616 return true;
6619 /* Function is_nonwrapping_integer_induction.
6621 Check that STMT_VINFO (which is part of loop LOOP) is an induction
6622 that increments and does not cause overflow. */
6624 static bool
6625 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6627 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6628 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6629 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6630 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6631 widest_int ni, max_loop_value, lhs_max;
6632 wi::overflow_type overflow = wi::OVF_NONE;
6634 /* Make sure the loop is integer based. */
6635 if (TREE_CODE (base) != INTEGER_CST
6636 || TREE_CODE (step) != INTEGER_CST)
6637 return false;
6639 /* Check that the max size of the loop will not wrap. */
6641 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6642 return true;
6644 if (! max_stmt_executions (loop, &ni))
6645 return false;
6647 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6648 &overflow);
6649 if (overflow)
6650 return false;
6652 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6653 TYPE_SIGN (lhs_type), &overflow);
6654 if (overflow)
6655 return false;
6657 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6658 <= TYPE_PRECISION (lhs_type));
6661 /* Check if masking can be supported by inserting a conditional expression.
6662 CODE is the code for the operation. COND_FN is the conditional internal
6663 function, if it exists. VECTYPE_IN is the type of the vector input. */
6664 static bool
6665 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6666 tree vectype_in)
6668 if (cond_fn != IFN_LAST
6669 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6670 OPTIMIZE_FOR_SPEED))
6671 return false;
6673 if (code.is_tree_code ())
6674 switch (tree_code (code))
6676 case DOT_PROD_EXPR:
6677 case SAD_EXPR:
6678 return true;
6680 default:
6681 break;
6683 return false;
6686 /* Insert a conditional expression to enable masked vectorization. CODE is the
6687 code for the operation. VOP is the array of operands. MASK is the loop
6688 mask. GSI is a statement iterator used to place the new conditional
6689 expression. */
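/* As an illustrative sketch, per element of the narrow input vector the
   selects built below behave like (op0, op1 and acc standing for VOP[0],
   VOP[1] and the accumulator VOP[2]):

     DOT_PROD_EXPR:  acc += op0 * (mask ? op1 : 0);
     SAD_EXPR:       acc += ABS (op0 - (mask ? op1 : op0));

   so elements with a zero mask bit contribute nothing to the
   reduction.  */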
6690 static void
6691 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6692 gimple_stmt_iterator *gsi)
6694 switch (tree_code (code))
6696 case DOT_PROD_EXPR:
6698 tree vectype = TREE_TYPE (vop[1]);
6699 tree zero = build_zero_cst (vectype);
6700 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6701 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6702 mask, vop[1], zero);
6703 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6704 vop[1] = masked_op1;
6705 break;
6708 case SAD_EXPR:
6710 tree vectype = TREE_TYPE (vop[1]);
6711 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6712 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6713 mask, vop[1], vop[0]);
6714 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6715 vop[1] = masked_op1;
6716 break;
6719 default:
6720 gcc_unreachable ();
6724 /* Function vectorizable_reduction.
6726 Check if STMT_INFO performs a reduction operation that can be vectorized.
6727 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6728 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6729 Return true if STMT_INFO is vectorizable in this way.
6731 This function also handles reduction idioms (patterns) that have been
6732 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6733 may be of this form:
6734 X = pattern_expr (arg0, arg1, ..., X)
6735 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6736 sequence that had been detected and replaced by the pattern-stmt
6737 (STMT_INFO).
6739 This function also handles reduction of condition expressions, for example:
6740 for (int i = 0; i < N; i++)
6741 if (a[i] < value)
6742 last = a[i];
6743 This is handled by vectorising the loop and creating an additional vector
6744 containing the loop indexes for which "a[i] < value" was true. In the
6745 function epilogue this is reduced to a single max value and then used to
6746 index into the vector of results.
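For example (an illustrative sketch of the scheme): each vector lane
additionally tracks the largest (1-based) iteration index at which its
condition was true, or 0 if it never was; the epilogue reduces this
index vector with a maximum and uses the result to pick the
corresponding element from the vector of results.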
6748 In some cases of reduction patterns, the type of the reduction variable X is
6749 different than the type of the other arguments of STMT_INFO.
6750 In such cases, the vectype that is used when transforming STMT_INFO into
6751 a vector stmt is different than the vectype that is used to determine the
6752 vectorization factor, because it consists of a different number of elements
6753 than the actual number of elements that are being operated upon in parallel.
6755 For example, consider an accumulation of shorts into an int accumulator.
6756 On some targets it's possible to vectorize this pattern operating on 8
6757 shorts at a time (hence, the vectype for purposes of determining the
6758 vectorization factor should be V8HI); on the other hand, the vectype that
6759 is used to create the vector form is actually V4SI (the type of the result).
6761 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6762 indicates what is the actual level of parallelism (V8HI in the example), so
6763 that the right vectorization factor would be derived. This vectype
6764 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6765 be used to create the vectorized stmt. The right vectype for the vectorized
6766 stmt is obtained from the type of the result X:
6767 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6769 This means that, contrary to "regular" reductions (or "regular" stmts in
6770 general), the following equation:
6771 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6772 does *NOT* necessarily hold for reduction patterns. */
6774 bool
6775 vectorizable_reduction (loop_vec_info loop_vinfo,
6776 stmt_vec_info stmt_info, slp_tree slp_node,
6777 slp_instance slp_node_instance,
6778 stmt_vector_for_cost *cost_vec)
6780 tree vectype_in = NULL_TREE;
6781 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6782 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6783 stmt_vec_info cond_stmt_vinfo = NULL;
6784 int i;
6785 int ncopies;
6786 bool single_defuse_cycle = false;
6787 bool nested_cycle = false;
6788 bool double_reduc = false;
6789 int vec_num;
6790 tree tem;
6791 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6792 tree cond_reduc_val = NULL_TREE;
6794 /* Make sure it was already recognized as a reduction computation. */
6795 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6796 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6797 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6798 return false;
6800 /* The stmt we store reduction analysis meta on. */
6801 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6802 reduc_info->is_reduc_info = true;
6804 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6806 if (is_a <gphi *> (stmt_info->stmt))
6808 if (slp_node)
6810 /* We eventually need to set a vector type on invariant
6811 arguments. */
6812 unsigned j;
6813 slp_tree child;
6814 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6815 if (!vect_maybe_update_slp_op_vectype
6816 (child, SLP_TREE_VECTYPE (slp_node)))
6818 if (dump_enabled_p ())
6819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6820 "incompatible vector types for "
6821 "invariants\n");
6822 return false;
6825 /* Analysis for double-reduction is done on the outer
6826 loop PHI, nested cycles have no further restrictions. */
6827 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6829 else
6830 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6831 return true;
6834 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6835 stmt_vec_info phi_info = stmt_info;
6836 if (!is_a <gphi *> (stmt_info->stmt))
6838 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6839 return true;
6841 if (slp_node)
6843 slp_node_instance->reduc_phis = slp_node;
6844 /* ??? We're leaving slp_node to point to the PHIs, we only
6845 need it to get at the number of vector stmts which wasn't
6846 yet initialized for the instance root. */
6848 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6850 use_operand_p use_p;
6851 gimple *use_stmt;
6852 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6853 &use_p, &use_stmt);
6854 gcc_assert (res);
6855 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6858 /* PHIs should not participate in patterns. */
6859 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6860 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6862 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6863 and compute the reduction chain length. Discover the real
6864 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6865 tree reduc_def
6866 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6867 loop_latch_edge
6868 (gimple_bb (reduc_def_phi)->loop_father));
6869 unsigned reduc_chain_length = 0;
6870 bool only_slp_reduc_chain = true;
6871 stmt_info = NULL;
6872 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6873 while (reduc_def != PHI_RESULT (reduc_def_phi))
6875 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6876 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6877 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881 "reduction chain broken by patterns.\n");
6882 return false;
6884 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6885 only_slp_reduc_chain = false;
6886 /* For epilogue generation live members of the chain need
6887 to point back to the PHI via their original stmt for
6888 info_for_reduction to work. For SLP we need to look at
6889 all lanes here - even though we only will vectorize from
6890 the SLP node with live lane zero the other live lanes also
6891 need to be identified as part of a reduction to be able
6892 to skip code generation for them. */
6893 if (slp_for_stmt_info)
6895 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6896 if (STMT_VINFO_LIVE_P (s))
6897 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6899 else if (STMT_VINFO_LIVE_P (vdef))
6900 STMT_VINFO_REDUC_DEF (def) = phi_info;
6901 gimple_match_op op;
6902 if (!gimple_extract_op (vdef->stmt, &op))
6904 if (dump_enabled_p ())
6905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906 "reduction chain includes unsupported"
6907 " statement type.\n");
6908 return false;
6910 if (CONVERT_EXPR_CODE_P (op.code))
6912 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6914 if (dump_enabled_p ())
6915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6916 "conversion in the reduction chain.\n");
6917 return false;
6920 else if (!stmt_info)
6921 /* First non-conversion stmt. */
6922 stmt_info = vdef;
6923 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6924 reduc_chain_length++;
6925 if (!stmt_info && slp_node)
6926 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6928 /* PHIs should not participate in patterns. */
6929 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6931 if (nested_in_vect_loop_p (loop, stmt_info))
6933 loop = loop->inner;
6934 nested_cycle = true;
6937 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6938 element. */
6939 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6941 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6942 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6944 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6945 gcc_assert (slp_node
6946 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6948 /* 1. Is vectorizable reduction? */
6949 /* Not supportable if the reduction variable is used in the loop, unless
6950 it's a reduction chain. */
6951 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6952 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6953 return false;
6955 /* Reductions that are not used even in an enclosing outer-loop,
6956 are expected to be "live" (used out of the loop). */
6957 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6958 && !STMT_VINFO_LIVE_P (stmt_info))
6959 return false;
6961 /* 2. Has this been recognized as a reduction pattern?
6963 Check if STMT represents a pattern that has been recognized
6964 in earlier analysis stages. For stmts that represent a pattern,
6965 the STMT_VINFO_RELATED_STMT field records the last stmt in
6966 the original sequence that constitutes the pattern. */
6968 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6969 if (orig_stmt_info)
6971 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6972 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6975 /* 3. Check the operands of the operation. The first operands are defined
6976 inside the loop body. The last operand is the reduction variable,
6977 which is defined by the loop-header-phi. */
6979 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6980 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6981 gimple_match_op op;
6982 if (!gimple_extract_op (stmt_info->stmt, &op))
6983 gcc_unreachable ();
6984 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6985 || op.code == WIDEN_SUM_EXPR
6986 || op.code == SAD_EXPR);
6988 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6989 && !SCALAR_FLOAT_TYPE_P (op.type))
6990 return false;
6992 /* Do not try to vectorize bit-precision reductions. */
6993 if (!type_has_mode_precision_p (op.type))
6994 return false;
6996 /* For lane-reducing ops we're reducing the number of reduction PHIs
6997 which means the only use of that may be in the lane-reducing operation. */
6998 if (lane_reduc_code_p
6999 && reduc_chain_length != 1
7000 && !only_slp_reduc_chain)
7002 if (dump_enabled_p ())
7003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7004 "lane-reducing reduction with extra stmts.\n");
7005 return false;
7008 /* All uses but the last are expected to be defined in the loop.
7009 The last use is the reduction variable. In case of nested cycle this
7010 assumption is not true: we use reduc_index to record the index of the
7011 reduction variable. */
7012 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7013 /* We need to skip an extra operand for COND_EXPRs with embedded
7014 comparison. */
7015 unsigned opno_adjust = 0;
7016 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7017 opno_adjust = 1;
7018 for (i = 0; i < (int) op.num_ops; i++)
7020 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7021 if (i == 0 && op.code == COND_EXPR)
7022 continue;
7024 stmt_vec_info def_stmt_info;
7025 enum vect_def_type dt;
7026 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7027 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7028 &tem, &def_stmt_info))
7030 if (dump_enabled_p ())
7031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7032 "use not simple.\n");
7033 return false;
7035 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7036 continue;
7038 /* There should be only one cycle def in the stmt, the one
7039 leading to reduc_def. */
7040 if (VECTORIZABLE_CYCLE_DEF (dt))
7041 return false;
7043 /* To properly compute ncopies we are interested in the widest
7044 non-reduction input type in case we're looking at a widening
7045 accumulation that we later handle in vect_transform_reduction. */
7046 if (lane_reduc_code_p
7047 && tem
7048 && (!vectype_in
7049 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7050 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
7051 vectype_in = tem;
7053 if (op.code == COND_EXPR)
7055 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7056 if (dt == vect_constant_def)
7058 cond_reduc_dt = dt;
7059 cond_reduc_val = op.ops[i];
7061 if (dt == vect_induction_def
7062 && def_stmt_info
7063 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7065 cond_reduc_dt = dt;
7066 cond_stmt_vinfo = def_stmt_info;
7070 if (!vectype_in)
7071 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7072 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7074 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7075 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7076 /* If we have a condition reduction, see if we can simplify it further. */
7077 if (v_reduc_type == COND_REDUCTION)
7079 if (slp_node)
7080 return false;
7082 /* When the condition uses the reduction value in the condition, fail. */
7083 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7085 if (dump_enabled_p ())
7086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7087 "condition depends on previous iteration\n");
7088 return false;
7091 if (reduc_chain_length == 1
7092 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7093 vectype_in, OPTIMIZE_FOR_SPEED))
7095 if (dump_enabled_p ())
7096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097 "optimizing condition reduction with"
7098 " FOLD_EXTRACT_LAST.\n");
7099 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7101 else if (cond_reduc_dt == vect_induction_def)
7103 tree base
7104 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7105 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7107 gcc_assert (TREE_CODE (base) == INTEGER_CST
7108 && TREE_CODE (step) == INTEGER_CST);
7109 cond_reduc_val = NULL_TREE;
7110 enum tree_code cond_reduc_op_code = ERROR_MARK;
7111 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7112 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7114 /* Find a suitable value: below base for MAX_EXPR, above base for
7115 MIN_EXPR; punt for now if base is the minimum value of the type
7116 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7117 else if (tree_int_cst_sgn (step) == -1)
7119 cond_reduc_op_code = MIN_EXPR;
7120 if (tree_int_cst_sgn (base) == -1)
7121 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7122 else if (tree_int_cst_lt (base,
7123 TYPE_MAX_VALUE (TREE_TYPE (base))))
7124 cond_reduc_val
7125 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7127 else
7129 cond_reduc_op_code = MAX_EXPR;
7130 if (tree_int_cst_sgn (base) == 1)
7131 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7132 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7133 base))
7134 cond_reduc_val
7135 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7137 if (cond_reduc_val)
7139 if (dump_enabled_p ())
7140 dump_printf_loc (MSG_NOTE, vect_location,
7141 "condition expression based on "
7142 "integer induction.\n");
7143 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7144 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7145 = cond_reduc_val;
7146 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7149 else if (cond_reduc_dt == vect_constant_def)
7151 enum vect_def_type cond_initial_dt;
7152 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7153 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7154 if (cond_initial_dt == vect_constant_def
7155 && types_compatible_p (TREE_TYPE (cond_initial_val),
7156 TREE_TYPE (cond_reduc_val)))
7158 tree e = fold_binary (LE_EXPR, boolean_type_node,
7159 cond_initial_val, cond_reduc_val);
7160 if (e && (integer_onep (e) || integer_zerop (e)))
7162 if (dump_enabled_p ())
7163 dump_printf_loc (MSG_NOTE, vect_location,
7164 "condition expression based on "
7165 "compile time constant.\n");
7166 /* Record reduction code at analysis stage. */
7167 STMT_VINFO_REDUC_CODE (reduc_info)
7168 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7169 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7175 if (STMT_VINFO_LIVE_P (phi_info))
7176 return false;
7178 if (slp_node)
7179 ncopies = 1;
7180 else
7181 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7183 gcc_assert (ncopies >= 1);
7185 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7187 if (nested_cycle)
7189 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7190 == vect_double_reduction_def);
7191 double_reduc = true;
7194 /* 4.2. Check support for the epilog operation.
7196 If STMT represents a reduction pattern, then the type of the
7197 reduction variable may be different than the type of the rest
7198 of the arguments. For example, consider the case of accumulation
7199 of shorts into an int accumulator. The original code:
7200 S1: int_a = (int) short_a;
7201 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7203 was replaced with:
7204 STMT: int_acc = widen_sum <short_a, int_acc>
7206 This means that:
7207 1. The tree-code that is used to create the vector operation in the
7208 epilog code (that reduces the partial results) is not the
7209 tree-code of STMT, but is rather the tree-code of the original
7210 stmt from the pattern that STMT is replacing. I.e, in the example
7211 above we want to use 'widen_sum' in the loop, but 'plus' in the
7212 epilog.
7213 2. The type (mode) we use to check available target support
7214 for the vector operation to be created in the *epilog*, is
7215 determined by the type of the reduction variable (in the example
7216 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7217 However the type (mode) we use to check available target support
7218 for the vector operation to be created *inside the loop*, is
7219 determined by the type of the other arguments to STMT (in the
7220 example we'd check this: optab_handler (widen_sum_optab,
7221 vect_short_mode)).
7223 This is contrary to "regular" reductions, in which the types of all
7224 the arguments are the same as the type of the reduction variable.
7225 For "regular" reductions we can therefore use the same vector type
7226 (and also the same tree-code) when generating the epilog code and
7227 when generating the code inside the loop. */
7229 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7230 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7232 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7233 if (reduction_type == TREE_CODE_REDUCTION)
7235 /* Check whether it's ok to change the order of the computation.
7236 Generally, when vectorizing a reduction we change the order of the
7237 computation. This may change the behavior of the program in some
7238 cases, so we need to check that this is ok. One exception is when
7239 vectorizing an outer-loop: the inner-loop is executed sequentially,
7240 and therefore vectorizing reductions in the inner-loop during
7241 outer-loop vectorization is safe. Likewise when we are vectorizing
7242 a series of reductions using SLP and the VF is one, the reductions
7243 are performed in scalar order. */
7244 if (slp_node
7245 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7246 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7248 else if (needs_fold_left_reduction_p (op.type, orig_code))
7250 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7251 is not directly used in stmt. */
7252 if (!only_slp_reduc_chain
7253 && reduc_chain_length != 1)
7255 if (dump_enabled_p ())
7256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7257 "in-order reduction chain without SLP.\n");
7258 return false;
7260 STMT_VINFO_REDUC_TYPE (reduc_info)
7261 = reduction_type = FOLD_LEFT_REDUCTION;
7263 else if (!commutative_binary_op_p (orig_code, op.type)
7264 || !associative_binary_op_p (orig_code, op.type))
7266 if (dump_enabled_p ())
7267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7268 "reduction: not commutative/associative");
7269 return false;
7273 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7274 && ncopies > 1)
7276 if (dump_enabled_p ())
7277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7278 "multiple types in double reduction or condition "
7279 "reduction or fold-left reduction.\n");
7280 return false;
7283 internal_fn reduc_fn = IFN_LAST;
7284 if (reduction_type == TREE_CODE_REDUCTION
7285 || reduction_type == FOLD_LEFT_REDUCTION
7286 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7287 || reduction_type == CONST_COND_REDUCTION)
7289 if (reduction_type == FOLD_LEFT_REDUCTION
7290 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7291 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7293 if (reduc_fn != IFN_LAST
7294 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7295 OPTIMIZE_FOR_SPEED))
7297 if (dump_enabled_p ())
7298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7299 "reduc op not supported by target.\n");
7301 reduc_fn = IFN_LAST;
7304 else
7306 if (!nested_cycle || double_reduc)
7308 if (dump_enabled_p ())
7309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7310 "no reduc code for scalar code.\n");
7312 return false;
7316 else if (reduction_type == COND_REDUCTION)
7318 int scalar_precision
7319 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7320 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7321 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7322 vectype_out);
7324 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7325 OPTIMIZE_FOR_SPEED))
7326 reduc_fn = IFN_REDUC_MAX;
7328 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7330 if (reduction_type != EXTRACT_LAST_REDUCTION
7331 && (!nested_cycle || double_reduc)
7332 && reduc_fn == IFN_LAST
7333 && !nunits_out.is_constant ())
7335 if (dump_enabled_p ())
7336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7337 "missing target support for reduction on"
7338 " variable-length vectors.\n");
7339 return false;
7342 /* For SLP reductions, see if there is a neutral value we can use. */
7343 tree neutral_op = NULL_TREE;
7344 if (slp_node)
7346 tree initial_value = NULL_TREE;
7347 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7348 initial_value = vect_phi_initial_value (reduc_def_phi);
7349 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7350 orig_code, initial_value);
7353 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7355 /* We can't support in-order reductions of code such as this:
7357 for (int i = 0; i < n1; ++i)
7358 for (int j = 0; j < n2; ++j)
7359 l += a[j];
7361 since GCC effectively transforms the loop when vectorizing:
7363 for (int i = 0; i < n1 / VF; ++i)
7364 for (int j = 0; j < n2; ++j)
7365 for (int k = 0; k < VF; ++k)
7366 l += a[j];
7368 which is a reassociation of the original operation. */
7369 if (dump_enabled_p ())
7370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7371 "in-order double reduction not supported.\n");
7373 return false;
7376 if (reduction_type == FOLD_LEFT_REDUCTION
7377 && slp_node
7378 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7380 /* We cannot use in-order reductions in this case because there is
7381 an implicit reassociation of the operations involved. */
7382 if (dump_enabled_p ())
7383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7384 "in-order unchained SLP reductions not supported.\n");
7385 return false;
7388 /* For double reductions, and for SLP reductions with a neutral value,
7389 we construct a variable-length initial vector by loading a vector
7390 full of the neutral value and then shift-and-inserting the start
7391 values into the low-numbered elements. */
7392 if ((double_reduc || neutral_op)
7393 && !nunits_out.is_constant ()
7394 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7395 vectype_out, OPTIMIZE_FOR_SPEED))
7397 if (dump_enabled_p ())
7398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7399 "reduction on variable-length vectors requires"
7400 " target support for a vector-shift-and-insert"
7401 " operation.\n");
7402 return false;
7405 /* Check extra constraints for variable-length unchained SLP reductions. */
7406 if (STMT_SLP_TYPE (stmt_info)
7407 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7408 && !nunits_out.is_constant ())
7410 /* We checked above that we could build the initial vector when
7411 there's a neutral element value. Check here for the case in
7412 which each SLP statement has its own initial value and in which
7413 that value needs to be repeated for every instance of the
7414 statement within the initial vector. */
7415 unsigned int group_size = SLP_TREE_LANES (slp_node);
7416 if (!neutral_op
7417 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7418 TREE_TYPE (vectype_out)))
7420 if (dump_enabled_p ())
7421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7422 "unsupported form of SLP reduction for"
7423 " variable-length vectors: cannot build"
7424 " initial vector.\n");
7425 return false;
7427 /* The epilogue code relies on the number of elements being a multiple
7428 of the group size. The duplicate-and-interleave approach to setting
7429 up the initial vector does too. */
7430 if (!multiple_p (nunits_out, group_size))
7432 if (dump_enabled_p ())
7433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7434 "unsupported form of SLP reduction for"
7435 " variable-length vectors: the vector size"
7436 " is not a multiple of the number of results.\n");
7437 return false;
7441 if (reduction_type == COND_REDUCTION)
7443 widest_int ni;
7445 if (! max_loop_iterations (loop, &ni))
7447 if (dump_enabled_p ())
7448 dump_printf_loc (MSG_NOTE, vect_location,
7449 "loop count not known, cannot create cond "
7450 "reduction.\n");
7451 return false;
7453 /* Convert backedges to iterations. */
7454 ni += 1;
7456 /* The additional index will be the same type as the condition. Check
7457 that the loop iteration count fits into this type less one (because
7458 we'll use up the zero slot for when there are no matches). */
7459 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7460 if (wi::geu_p (ni, wi::to_widest (max_index)))
7462 if (dump_enabled_p ())
7463 dump_printf_loc (MSG_NOTE, vect_location,
7464 "loop size is greater than data size.\n");
7465 return false;
7469 /* In case the vectorization factor (VF) is bigger than the number
7470 of elements that we can fit in a vectype (nunits), we have to generate
7471 more than one vector stmt, i.e. we need to "unroll" the
7472 vector stmt by a factor of VF/nunits. For more details see documentation
7473 in vectorizable_operation. */
7475 /* If the reduction is used in an outer loop we need to generate
7476 VF intermediate results, like so (e.g. for ncopies=2):
7477 r0 = phi (init, r0)
7478 r1 = phi (init, r1)
7479 r0 = x0 + r0;
7480 r1 = x1 + r1;
7481 (i.e. we generate VF results in 2 registers).
7482 In this case we have a separate def-use cycle for each copy, and therefore
7483 for each copy we get the vector def for the reduction variable from the
7484 respective phi node created for this copy.
7486 Otherwise (the reduction is unused in the loop nest), we can combine
7487 together intermediate results, like so (e.g. for ncopies=2):
7488 r = phi (init, r)
7489 r = x0 + r;
7490 r = x1 + r;
7491 (i.e. we generate VF/2 results in a single register).
7492 In this case for each copy we get the vector def for the reduction variable
7493 from the vectorized reduction operation generated in the previous iteration.
7495 This only works when we see both the reduction PHI and its only consumer
7496 in vectorizable_reduction and there are no intermediate stmts
7497 participating. When unrolling we want each unrolled iteration to have its
7498 own reduction accumulator since one of the main goals of unrolling a
7499 reduction is to reduce the aggregate loop-carried latency. */
7500 if (ncopies > 1
7501 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7502 && reduc_chain_length == 1
7503 && loop_vinfo->suggested_unroll_factor == 1)
7504 single_defuse_cycle = true;
7506 if (single_defuse_cycle || lane_reduc_code_p)
7508 gcc_assert (op.code != COND_EXPR);
7510 /* 4. Supportable by target? */
7511 bool ok = true;
7513 /* 4.1. check support for the operation in the loop
7515 This isn't necessary for the lane reduction codes, since they
7516 can only be produced by pattern matching, and it's up to the
7517 pattern matcher to test for support. The main reason for
7518 specifically skipping this step is to avoid rechecking whether
7519 mixed-sign dot-products can be implemented using signed
7520 dot-products. */
7521 machine_mode vec_mode = TYPE_MODE (vectype_in);
7522 if (!lane_reduc_code_p
7523 && !directly_supported_p (op.code, vectype_in, optab_vector))
7525 if (dump_enabled_p ())
7526 dump_printf (MSG_NOTE, "op not supported by target.\n");
7527 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7528 || !vect_can_vectorize_without_simd_p (op.code))
7529 ok = false;
7530 else
7531 if (dump_enabled_p ())
7532 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7535 if (vect_emulated_vector_p (vectype_in)
7536 && !vect_can_vectorize_without_simd_p (op.code))
7538 if (dump_enabled_p ())
7539 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7540 return false;
7543 /* lane-reducing operations have to go through vect_transform_reduction.
7544 For the other cases try without the single cycle optimization. */
7545 if (!ok)
7547 if (lane_reduc_code_p)
7548 return false;
7549 else
7550 single_defuse_cycle = false;
7553 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7555 /* If the reduction stmt is one of the patterns that have lane
7556 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7557 if ((ncopies > 1 && ! single_defuse_cycle)
7558 && lane_reduc_code_p)
7560 if (dump_enabled_p ())
7561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7562 "multi def-use cycle not possible for lane-reducing "
7563 "reduction operation\n");
7564 return false;
7567 if (slp_node
7568 && !(!single_defuse_cycle
7569 && !lane_reduc_code_p
7570 && reduction_type != FOLD_LEFT_REDUCTION))
7571 for (i = 0; i < (int) op.num_ops; i++)
7572 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7576 "incompatible vector types for invariants\n");
7577 return false;
7580 if (slp_node)
7581 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7582 else
7583 vec_num = 1;
7585 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7586 reduction_type, ncopies, cost_vec);
7587 /* Cost the reduction op inside the loop if transformed via
7588 vect_transform_reduction. Otherwise this is costed by the
7589 separate vectorizable_* routines. */
7590 if (single_defuse_cycle || lane_reduc_code_p)
7592 int factor = 1;
7593 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7594 /* Three dot-products and a subtraction. */
7595 factor = 4;
7596 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7597 stmt_info, 0, vect_body);
7600 if (dump_enabled_p ()
7601 && reduction_type == FOLD_LEFT_REDUCTION)
7602 dump_printf_loc (MSG_NOTE, vect_location,
7603 "using an in-order (fold-left) reduction.\n");
7604 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7605 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7606 reductions go through their own vectorizable_* routines. */
7607 if (!single_defuse_cycle
7608 && !lane_reduc_code_p
7609 && reduction_type != FOLD_LEFT_REDUCTION)
7611 stmt_vec_info tem
7612 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7613 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7615 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7616 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7618 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7619 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7621 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7623 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7624 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7626 if (reduction_type != FOLD_LEFT_REDUCTION
7627 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7628 && (cond_fn == IFN_LAST
7629 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7630 OPTIMIZE_FOR_SPEED)))
7632 if (dump_enabled_p ())
7633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7634 "can't operate on partial vectors because"
7635 " no conditional operation is available.\n");
7636 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7638 else if (reduction_type == FOLD_LEFT_REDUCTION
7639 && reduc_fn == IFN_LAST
7640 && !expand_vec_cond_expr_p (vectype_in,
7641 truth_type_for (vectype_in),
7642 SSA_NAME))
7644 if (dump_enabled_p ())
7645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7646 "can't operate on partial vectors because"
7647 " no conditional operation is available.\n");
7648 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7650 else
7651 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7652 vectype_in, NULL);
7654 return true;
7657 /* STMT_INFO is a dot-product reduction whose multiplication operands
7658 have different signs. Emit a sequence to emulate the operation
7659 using a series of signed DOT_PROD_EXPRs and return the last
7660 statement generated. VEC_DEST is the result of the vector operation
7661 and VOP lists its inputs. */
7663 static gassign *
7664 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7665 gimple_stmt_iterator *gsi, tree vec_dest,
7666 tree vop[3])
7668 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7669 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7670 tree narrow_elttype = TREE_TYPE (narrow_vectype);
7671 gimple *new_stmt;
7673 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
7674 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7675 std::swap (vop[0], vop[1]);
7677 /* Convert all inputs to signed types. */
7678 for (int i = 0; i < 3; ++i)
7679 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7681 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7682 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7683 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7684 vop[i] = tmp;
7687 /* In the comments below we assume 8-bit inputs for simplicity,
7688 but the approach works for any full integer type. */
7690 /* Create a vector of -128. */
7691 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7692 tree min_narrow = build_vector_from_val (narrow_vectype,
7693 min_narrow_elttype);
7695 /* Create a vector of 64. */
7696 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7697 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7698 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7700 /* Emit: SUB_RES = VOP[0] - 128. */
7701 tree sub_res = make_ssa_name (narrow_vectype);
7702 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7703 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7705 /* Emit:
7707 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7708 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7709 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7711 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7712 Doing the two 64 * y steps first allows more time to compute x. */
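/* For example, with x = 200 and y = -3 the identity gives
   (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600,
   which matches 200 * -3.  */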
7713 tree stage1 = make_ssa_name (wide_vectype);
7714 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7715 vop[1], half_narrow, vop[2]);
7716 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7718 tree stage2 = make_ssa_name (wide_vectype);
7719 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7720 vop[1], half_narrow, stage1);
7721 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7723 tree stage3 = make_ssa_name (wide_vectype);
7724 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7725 sub_res, vop[1], stage2);
7726 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7728 /* Convert STAGE3 to the reduction type. */
7729 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7732 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7733 value. */
7735 bool
7736 vect_transform_reduction (loop_vec_info loop_vinfo,
7737 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7738 gimple **vec_stmt, slp_tree slp_node)
7740 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7741 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7742 int i;
7743 int ncopies;
7744 int vec_num;
7746 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7747 gcc_assert (reduc_info->is_reduc_info);
7749 if (nested_in_vect_loop_p (loop, stmt_info))
7751 loop = loop->inner;
7752 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7755 gimple_match_op op;
7756 if (!gimple_extract_op (stmt_info->stmt, &op))
7757 gcc_unreachable ();
7759 /* All uses but the last are expected to be defined in the loop.
7760 The last use is the reduction variable. In case of nested cycle this
7761 assumption is not true: we use reduc_index to record the index of the
7762 reduction variable. */
7763 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7764 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7765 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7766 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7768 if (slp_node)
7770 ncopies = 1;
7771 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7773 else
7775 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7776 vec_num = 1;
7779 code_helper code = canonicalize_code (op.code, op.type);
7780 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7781 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7782 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7784 /* Transform. */
7785 tree new_temp = NULL_TREE;
7786 auto_vec<tree> vec_oprnds0;
7787 auto_vec<tree> vec_oprnds1;
7788 auto_vec<tree> vec_oprnds2;
7789 tree def0;
7791 if (dump_enabled_p ())
7792 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7794 /* FORNOW: Multiple types are not supported for condition. */
7795 if (code == COND_EXPR)
7796 gcc_assert (ncopies == 1);
7798 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7800 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7801 if (reduction_type == FOLD_LEFT_REDUCTION)
7803 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7804 gcc_assert (code.is_tree_code ());
7805 return vectorize_fold_left_reduction
7806 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
7807 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
7810 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7811 gcc_assert (single_defuse_cycle
7812 || code == DOT_PROD_EXPR
7813 || code == WIDEN_SUM_EXPR
7814 || code == SAD_EXPR);
7816 /* Create the destination vector */
7817 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7818 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7820 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7821 single_defuse_cycle && reduc_index == 0
7822 ? NULL_TREE : op.ops[0], &vec_oprnds0,
7823 single_defuse_cycle && reduc_index == 1
7824 ? NULL_TREE : op.ops[1], &vec_oprnds1,
7825 op.num_ops == 3
7826 && !(single_defuse_cycle && reduc_index == 2)
7827 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7828 if (single_defuse_cycle)
7830 gcc_assert (!slp_node);
7831 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7832 op.ops[reduc_index],
7833 reduc_index == 0 ? &vec_oprnds0
7834 : (reduc_index == 1 ? &vec_oprnds1
7835 : &vec_oprnds2));
7838 bool emulated_mixed_dot_prod
7839 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
7840 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7842 gimple *new_stmt;
7843 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7844 if (masked_loop_p && !mask_by_cond_expr)
7846 /* No conditional ifns have been defined for dot-product yet. */
7847 gcc_assert (code != DOT_PROD_EXPR);
7849 /* Make sure that the reduction accumulator is vop[0]. */
7850 if (reduc_index == 1)
7852 gcc_assert (commutative_binary_op_p (code, op.type));
7853 std::swap (vop[0], vop[1]);
7855 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7856 vectype_in, i);
7857 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7858 vop[0], vop[1], vop[0]);
7859 new_temp = make_ssa_name (vec_dest, call);
7860 gimple_call_set_lhs (call, new_temp);
7861 gimple_call_set_nothrow (call, true);
7862 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7863 new_stmt = call;
7865 else
7867 if (op.num_ops == 3)
7868 vop[2] = vec_oprnds2[i];
7870 if (masked_loop_p && mask_by_cond_expr)
7872 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7873 vectype_in, i);
7874 build_vect_cond_expr (code, vop, mask, gsi);
7877 if (emulated_mixed_dot_prod)
7878 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
7879 vec_dest, vop);
7880 else if (code.is_internal_fn ())
7881 new_stmt = gimple_build_call_internal (internal_fn (code),
7882 op.num_ops,
7883 vop[0], vop[1], vop[2]);
7884 else
7885 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
7886 vop[0], vop[1], vop[2]);
7887 new_temp = make_ssa_name (vec_dest, new_stmt);
7888 gimple_set_lhs (new_stmt, new_temp);
7889 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7892 if (slp_node)
7893 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7894 else if (single_defuse_cycle
7895 && i < ncopies - 1)
7897 if (reduc_index == 0)
7898 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7899 else if (reduc_index == 1)
7900 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7901 else if (reduc_index == 2)
7902 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7904 else
7905 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7908 if (!slp_node)
7909 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7911 return true;
7914 /* Transform phase of a cycle PHI. */
7916 bool
7917 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7918 stmt_vec_info stmt_info, gimple **vec_stmt,
7919 slp_tree slp_node, slp_instance slp_node_instance)
7921 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7923 int i;
7924 int ncopies;
7925 int j;
7926 bool nested_cycle = false;
7927 int vec_num;
7929 if (nested_in_vect_loop_p (loop, stmt_info))
7931 loop = loop->inner;
7932 nested_cycle = true;
7935 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7936 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7937 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7938 gcc_assert (reduc_info->is_reduc_info);
7940 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7941 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7942 /* Leave the scalar phi in place. */
7943 return true;
7945 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7946 /* For a nested cycle we do not fill the above. */
7947 if (!vectype_in)
7948 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7949 gcc_assert (vectype_in);
7951 if (slp_node)
7953 /* The size vect_schedule_slp_instance computes is off for us. */
7954 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7955 * SLP_TREE_LANES (slp_node), vectype_in);
7956 ncopies = 1;
7958 else
7960 vec_num = 1;
7961 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7964 /* Check whether we should use a single PHI node and accumulate
7965 vectors to one before the backedge. */
7966 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7967 ncopies = 1;
7969 /* Create the destination vector */
7970 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7971 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7972 vectype_out);
7974 /* Get the loop-entry arguments. */
7975 tree vec_initial_def = NULL_TREE;
7976 auto_vec<tree> vec_initial_defs;
7977 if (slp_node)
7979 vec_initial_defs.reserve (vec_num);
7980 if (nested_cycle)
7982 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7983 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7984 &vec_initial_defs);
7986 else
7988 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7989 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7990 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7992 unsigned int num_phis = stmts.length ();
7993 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7994 num_phis = 1;
7995 initial_values.reserve (num_phis);
7996 for (unsigned int i = 0; i < num_phis; ++i)
7998 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7999 initial_values.quick_push (vect_phi_initial_value (this_phi));
8001 if (vec_num == 1)
8002 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8003 if (!initial_values.is_empty ())
8005 tree initial_value
8006 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8007 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8008 tree neutral_op
8009 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8010 code, initial_value);
8011 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8012 &vec_initial_defs, vec_num,
8013 stmts.length (), neutral_op);
8017 else
8019 /* Get at the scalar def before the loop, that defines the initial
8020 value of the reduction variable. */
8021 tree initial_def = vect_phi_initial_value (phi);
8022 reduc_info->reduc_initial_values.safe_push (initial_def);
8023 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8024 and we can't use zero for induc_val, use initial_def. Similarly
8025 for REDUC_MIN and initial_def larger than the base. */
8026 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8028 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8029 if (TREE_CODE (initial_def) == INTEGER_CST
8030 && !integer_zerop (induc_val)
8031 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8032 && tree_int_cst_lt (initial_def, induc_val))
8033 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8034 && tree_int_cst_lt (induc_val, initial_def))))
8036 induc_val = initial_def;
8037 /* Communicate that we used the initial_def to epilogue
8038 generation. */
8039 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8041 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8043 else if (nested_cycle)
8045 /* Do not use an adjustment def as that case is not supported
8046 correctly if ncopies is not one. */
8047 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8048 ncopies, initial_def,
8049 &vec_initial_defs);
8051 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8052 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8053 /* Fill the initial vector with the initial scalar value. */
8054 vec_initial_def
8055 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8056 initial_def, initial_def);
8057 else
8059 if (ncopies == 1)
8060 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8061 if (!reduc_info->reduc_initial_values.is_empty ())
8063 initial_def = reduc_info->reduc_initial_values[0];
8064 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8065 tree neutral_op
8066 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8067 code, initial_def);
8068 gcc_assert (neutral_op);
8069 /* Try to simplify the vector initialization by applying an
8070 adjustment after the reduction has been performed. */
8071 if (!reduc_info->reused_accumulator
8072 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8073 && !operand_equal_p (neutral_op, initial_def))
8075 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8076 = initial_def;
8077 initial_def = neutral_op;
8079 vec_initial_def
8080 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8081 initial_def, neutral_op);
8086 if (vec_initial_def)
8088 vec_initial_defs.create (ncopies);
8089 for (i = 0; i < ncopies; ++i)
8090 vec_initial_defs.quick_push (vec_initial_def);
8093 if (auto *accumulator = reduc_info->reused_accumulator)
8095 tree def = accumulator->reduc_input;
8096 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8098 unsigned int nreduc;
8099 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8100 (TREE_TYPE (def)),
8101 TYPE_VECTOR_SUBPARTS (vectype_out),
8102 &nreduc);
8103 gcc_assert (res);
8104 gimple_seq stmts = NULL;
8105 /* Reduce the single vector to a smaller one. */
8106 if (nreduc != 1)
8108 /* Perform the reduction in the appropriate type. */
8109 tree rvectype = vectype_out;
8110 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8111 TREE_TYPE (TREE_TYPE (def))))
8112 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8113 TYPE_VECTOR_SUBPARTS
8114 (vectype_out));
8115 def = vect_create_partial_epilog (def, rvectype,
8116 STMT_VINFO_REDUC_CODE
8117 (reduc_info),
8118 &stmts);
8120 /* The epilogue loop might use a different vector mode, like
8121 VNx2DI vs. V2DI. */
8122 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8124 tree reduc_type = build_vector_type_for_mode
8125 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8126 def = gimple_convert (&stmts, reduc_type, def);
8128 /* Adjust the input so we pick up the partially reduced value
8129 for the skip edge in vect_create_epilog_for_reduction. */
8130 accumulator->reduc_input = def;
8131 /* And the reduction could be carried out using a different sign. */
8132 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8133 def = gimple_convert (&stmts, vectype_out, def);
8134 if (loop_vinfo->main_loop_edge)
8136 /* While we'd like to insert on the edge this will split
8137 blocks and disturb bookkeeping, we also will eventually
8138 need this on the skip edge. Rely on sinking to
8139 fixup optimal placement and insert in the pred. */
8140 gimple_stmt_iterator gsi
8141 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8142 /* Insert before a cond that eventually skips the
8143 epilogue. */
8144 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8145 gsi_prev (&gsi);
8146 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8148 else
8149 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8150 stmts);
8152 if (loop_vinfo->main_loop_edge)
8153 vec_initial_defs[0]
8154 = vect_get_main_loop_result (loop_vinfo, def,
8155 vec_initial_defs[0]);
8156 else
8157 vec_initial_defs.safe_push (def);
8160 /* Generate the reduction PHIs upfront. */
8161 for (i = 0; i < vec_num; i++)
8163 tree vec_init_def = vec_initial_defs[i];
8164 for (j = 0; j < ncopies; j++)
8166 /* Create the reduction-phi that defines the reduction
8167 operand. */
8168 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8170 /* Set the loop-entry arg of the reduction-phi. */
8171 if (j != 0 && nested_cycle)
8172 vec_init_def = vec_initial_defs[j];
8173 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8174 UNKNOWN_LOCATION);
8176 /* The loop-latch arg is set in epilogue processing. */
8178 if (slp_node)
8179 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8180 else
8182 if (j == 0)
8183 *vec_stmt = new_phi;
8184 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8189 return true;
8192 /* Vectorizes LC PHIs. */
8194 bool
8195 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8196 stmt_vec_info stmt_info, gimple **vec_stmt,
8197 slp_tree slp_node)
8199 if (!loop_vinfo
8200 || !is_a <gphi *> (stmt_info->stmt)
8201 || gimple_phi_num_args (stmt_info->stmt) != 1)
8202 return false;
8204 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8205 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8206 return false;
8208 if (!vec_stmt) /* transformation not required. */
8210 /* Deal with copies from externs or constants that are disguised as
8211 loop-closed PHI nodes (PR97886). */
8212 if (slp_node
8213 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8214 SLP_TREE_VECTYPE (slp_node)))
8216 if (dump_enabled_p ())
8217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8218 "incompatible vector types for invariants\n");
8219 return false;
8221 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8222 return true;
8225 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8226 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8227 basic_block bb = gimple_bb (stmt_info->stmt);
8228 edge e = single_pred_edge (bb);
8229 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8230 auto_vec<tree> vec_oprnds;
8231 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8232 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8233 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8234 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8236 /* Create the vectorized LC PHI node. */
8237 gphi *new_phi = create_phi_node (vec_dest, bb);
8238 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8239 if (slp_node)
8240 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8241 else
8242 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8244 if (!slp_node)
8245 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8247 return true;
8250 /* Vectorizes PHIs. */
8252 bool
8253 vectorizable_phi (vec_info *,
8254 stmt_vec_info stmt_info, gimple **vec_stmt,
8255 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8257 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8258 return false;
8260 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8261 return false;
8263 tree vectype = SLP_TREE_VECTYPE (slp_node);
8265 if (!vec_stmt) /* transformation not required. */
8267 slp_tree child;
8268 unsigned i;
8269 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8270 if (!child)
8272 if (dump_enabled_p ())
8273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8274 "PHI node with unvectorized backedge def\n");
8275 return false;
8277 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8279 if (dump_enabled_p ())
8280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8281 "incompatible vector types for invariants\n");
8282 return false;
8284 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8285 && !useless_type_conversion_p (vectype,
8286 SLP_TREE_VECTYPE (child)))
8288 /* With bools we can have mask and non-mask precision vectors
8289 or different non-mask precisions. While pattern recog is
8290 supposed to guarantee consistency here, bugs in it can cause
8291 mismatches (PR103489 and PR103800 for example).
8292 Deal with them here instead of ICEing later. */
8293 if (dump_enabled_p ())
8294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8295 "incompatible vector type setup from "
8296 "bool pattern detection\n");
8297 return false;
8300 /* For single-argument PHIs assume coalescing which means zero cost
8301 for the scalar and the vector PHIs. This avoids artificially
8302 favoring the vector path (but may pessimize it in some cases). */
8303 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8304 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8305 vector_stmt, stmt_info, vectype, 0, vect_body);
8306 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8307 return true;
8310 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8311 basic_block bb = gimple_bb (stmt_info->stmt);
8312 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8313 auto_vec<gphi *> new_phis;
8314 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8316 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8318 /* Skip not yet vectorized defs. */
8319 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8320 && SLP_TREE_VEC_STMTS (child).is_empty ())
8321 continue;
8323 auto_vec<tree> vec_oprnds;
8324 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8325 if (!new_phis.exists ())
8327 new_phis.create (vec_oprnds.length ());
8328 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8330 /* Create the vectorized PHI node. */
8331 new_phis.quick_push (create_phi_node (vec_dest, bb));
8332 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8335 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8336 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8337 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8339 /* We should have at least one already vectorized child. */
8340 gcc_assert (new_phis.exists ());
8342 return true;
8345 /* Vectorizes first order recurrences. An overview of the transformation
8346 is described below. Suppose we have the following loop.
8348 int t = 0;
8349 for (int i = 0; i < n; ++i)
8351 b[i] = a[i] - t;
8352 t = a[i];
8355 There is a first-order recurrence on 't'. For this loop, the scalar IR
8356 looks (simplified) like:
8358 scalar.preheader:
8359 init = 0;
8361 scalar.body:
8362 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8363 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8364 _1 = a[i]
8365 b[i] = _1 - _2
8366 if (i < n) goto scalar.body
8368 In this example, _2 is a recurrence because its value depends on the
8369 previous iteration. We vectorize this as (VF = 4)
8371 vector.preheader:
8372 vect_init = vect_cst(..., ..., ..., 0)
8374 vector.body
8375 i = PHI <0(vector.preheader), i+4(vector.body)>
8376 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8377 vect_2 = a[i, i+1, i+2, i+3];
8378 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8379 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8380 if (..) goto vector.body
8382 In this function, vectorizable_recurr, we code generate both the
8383 vector PHI node and the permute since those together compute the
8384 vectorized value of the scalar PHI. We do not yet have the
8385 backedge value to fill in there nor into the vec_perm. Those
8386 are filled in maybe_set_vectorized_backedge_value and
8387 vect_schedule_scc.
8389 TODO: Since the scalar loop does not have a use of the recurrence
8390 outside of the loop the natural way to implement peeling via
8391 vectorizing the live value doesn't work. For now peeling of loops
8392 with a recurrence is not implemented. For SLP the supported cases
8393 are restricted to those requiring a single vector recurrence PHI. */
8395 bool
8396 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8397 gimple **vec_stmt, slp_tree slp_node,
8398 stmt_vector_for_cost *cost_vec)
8400 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8401 return false;
8403 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8405 /* So far we only support first-order recurrence auto-vectorization. */
8406 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8407 return false;
8409 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8410 unsigned ncopies;
8411 if (slp_node)
8412 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8413 else
8414 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8415 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8416 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8417 /* We need to be able to make progress with a single vector. */
8418 if (maybe_gt (dist * 2, nunits))
8420 if (dump_enabled_p ())
8421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8422 "first order recurrence exceeds half of "
8423 "a vector\n");
8424 return false;
8427 /* First-order recurrence autovectorization needs to handle permutation
8428 with indices = [nunits-dist, nunits-dist+1, nunits-dist+2, ...]. */
8429 vec_perm_builder sel (nunits, 1, 3);
8430 for (int i = 0; i < 3; ++i)
8431 sel.quick_push (nunits - dist + i);
8432 vec_perm_indices indices (sel, 2, nunits);
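/* Illustrative example (values chosen arbitrarily): with nunits == 8 and
   dist == 2 (an SLP node with two lanes) the encoded selector starts at
   nunits - dist == 6 and steps by 1, i.e. { 6, 7, 8, 9, 10, 11, 12, 13 },
   so the permute takes the last two lanes of the previous vector followed
   by the first six lanes of the current one. */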
8434 if (!vec_stmt) /* transformation not required. */
8436 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8437 indices))
8438 return false;
8440 if (slp_node)
8442 /* We eventually need to set a vector type on invariant
8443 arguments. */
8444 unsigned j;
8445 slp_tree child;
8446 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8447 if (!vect_maybe_update_slp_op_vectype
8448 (child, SLP_TREE_VECTYPE (slp_node)))
8450 if (dump_enabled_p ())
8451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8452 "incompatible vector types for "
8453 "invariants\n");
8454 return false;
8457 /* The recurrence costs the initialization vector and one permute
8458 for each copy. */
8459 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8460 stmt_info, 0, vect_prologue);
8461 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8462 stmt_info, 0, vect_body);
8463 if (dump_enabled_p ())
8464 dump_printf_loc (MSG_NOTE, vect_location,
8465 "vectorizable_recurr: inside_cost = %d, "
8466 "prologue_cost = %d .\n", inside_cost,
8467 prologue_cost);
8469 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8470 return true;
8473 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8474 basic_block bb = gimple_bb (phi);
8475 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8476 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8478 gimple_seq stmts = NULL;
8479 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8480 gsi_insert_seq_on_edge_immediate (pe, stmts);
8482 tree vec_init = build_vector_from_val (vectype, preheader);
8483 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8485 /* Create the vectorized first-order PHI node. */
8486 tree vec_dest = vect_get_new_vect_var (vectype,
8487 vect_simple_var, "vec_recur_");
8488 gphi *new_phi = create_phi_node (vec_dest, bb);
8489 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8491 /* Insert the shuffles for the first-order recurrence autovectorization:
8492 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8493 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8495 /* Insert the required permute after the latch definition. The
8496 second and later operands are tentative and will be updated when we have
8497 vectorized the latch definition. */
8498 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8499 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8500 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8501 gsi_next (&gsi2);
8503 for (unsigned i = 0; i < ncopies; ++i)
8505 vec_dest = make_ssa_name (vectype);
8506 gassign *vperm
8507 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8508 i == 0 ? gimple_phi_result (new_phi) : NULL,
8509 NULL, perm);
8510 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8512 if (slp_node)
8513 SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8514 else
8515 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8518 if (!slp_node)
8519 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8520 return true;
8523 /* Return true if VECTYPE represents a vector that requires lowering
8524 by the vector lowering pass. */
8526 bool
8527 vect_emulated_vector_p (tree vectype)
8529 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8530 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8531 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8534 /* Return true if we can emulate CODE on an integer mode representation
8535 of a vector. */
8537 bool
8538 vect_can_vectorize_without_simd_p (tree_code code)
8540 switch (code)
8542 case PLUS_EXPR:
8543 case MINUS_EXPR:
8544 case NEGATE_EXPR:
8545 case BIT_AND_EXPR:
8546 case BIT_IOR_EXPR:
8547 case BIT_XOR_EXPR:
8548 case BIT_NOT_EXPR:
8549 return true;
8551 default:
8552 return false;
8556 /* Likewise, but taking a code_helper. */
8558 bool
8559 vect_can_vectorize_without_simd_p (code_helper code)
8561 return (code.is_tree_code ()
8562 && vect_can_vectorize_without_simd_p (tree_code (code)));
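/* For illustration (a sketch, not tied to a particular target): a bit-wise
   AND of two V4QImode vectors can be carried out as a single AND on the
   SImode integers holding their bytes; the codes accepted above are the
   ones the generic vector lowering pass knows how to emulate on such an
   integer-mode representation. */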
8565 /* Create vector init for vectorized iv. */
8566 static tree
8567 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8568 tree step_expr, poly_uint64 nunits,
8569 tree vectype,
8570 enum vect_induction_op_type induction_type)
8572 unsigned HOST_WIDE_INT const_nunits;
8573 tree vec_shift, vec_init, new_name;
8574 unsigned i;
8575 tree itype = TREE_TYPE (vectype);
8577 /* iv_loop is the loop to be vectorized. Create the vector of initial
8578 values from X = init_expr and S = step_expr according to induction_type. */
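/* Illustrative examples (arbitrary small values, nunits == 4):
   vect_step_op_shr with X = 64, S = 1:
     vec_init = [64, 64, 64, 64] >> [0, 1, 2, 3] = [64, 32, 16, 8]
   vect_step_op_mul with X = 3, S = 2:
     vec_init = [3, 3, 3, 3] * [1, 2, 4, 8] = [3, 6, 12, 24]
   vect_step_op_neg with X = 5:
     vec_init = [5, -5, 5, -5]. */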
8579 new_name = gimple_convert (stmts, itype, init_expr);
8580 switch (induction_type)
8582 case vect_step_op_shr:
8583 case vect_step_op_shl:
8584 /* Build the Initial value from shift_expr. */
8585 vec_init = gimple_build_vector_from_val (stmts,
8586 vectype,
8587 new_name);
8588 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8589 build_zero_cst (itype), step_expr);
8590 vec_init = gimple_build (stmts,
8591 (induction_type == vect_step_op_shr
8592 ? RSHIFT_EXPR : LSHIFT_EXPR),
8593 vectype, vec_init, vec_shift);
8594 break;
8596 case vect_step_op_neg:
8598 vec_init = gimple_build_vector_from_val (stmts,
8599 vectype,
8600 new_name);
8601 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8602 vectype, vec_init);
8603 /* The encoding has 2 interleaved stepped patterns. */
8604 vec_perm_builder sel (nunits, 2, 3);
8605 sel.quick_grow (6);
8606 for (i = 0; i < 3; i++)
8608 sel[2 * i] = i;
8609 sel[2 * i + 1] = i + nunits;
8611 vec_perm_indices indices (sel, 2, nunits);
8612 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8613 fail when vec_init is a const vector. In that situation the vec_perm is
8614 not really needed. */
8615 tree perm_mask_even
8616 = vect_gen_perm_mask_any (vectype, indices);
8617 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8618 vectype,
8619 vec_init, vec_neg,
8620 perm_mask_even);
8622 break;
8624 case vect_step_op_mul:
8626 /* Use unsigned mult to avoid undefined behavior on signed overflow. */
8627 gcc_assert (nunits.is_constant (&const_nunits));
8628 tree utype = unsigned_type_for (itype);
8629 tree uvectype = build_vector_type (utype,
8630 TYPE_VECTOR_SUBPARTS (vectype));
8631 new_name = gimple_convert (stmts, utype, new_name);
8632 vec_init = gimple_build_vector_from_val (stmts,
8633 uvectype,
8634 new_name);
8635 tree_vector_builder elts (uvectype, const_nunits, 1);
8636 tree elt_step = build_one_cst (utype);
8638 elts.quick_push (elt_step);
8639 for (i = 1; i < const_nunits; i++)
8641 /* Create: elt_step_i = elt_step_(i-1) * step_expr, i.e. pow (S, i). */
8642 elt_step = gimple_build (stmts, MULT_EXPR,
8643 utype, elt_step, step_expr);
8644 elts.quick_push (elt_step);
8646 /* Create a vector of the step powers [1, S, pow (S, 2), ...,
8647 pow (S, nunits-1)]. */
8648 tree vec_mul = gimple_build_vector (stmts, &elts);
8649 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8650 vec_init, vec_mul);
8651 vec_init = gimple_convert (stmts, vectype, vec_init);
8653 break;
8655 default:
8656 gcc_unreachable ();
8659 return vec_init;
8662 /* Peel init_expr by skip_niters iterations for the given induction_type. */
8663 tree
8664 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8665 tree skip_niters, tree step_expr,
8666 enum vect_induction_op_type induction_type)
8668 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8669 tree type = TREE_TYPE (init_expr);
8670 unsigned prec = TYPE_PRECISION (type);
8671 switch (induction_type)
8673 case vect_step_op_neg:
8674 if (TREE_INT_CST_LOW (skip_niters) % 2)
8675 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8676 /* else no change. */
8677 break;
8679 case vect_step_op_shr:
8680 case vect_step_op_shl:
8681 skip_niters = gimple_convert (stmts, type, skip_niters);
8682 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8683 /* When the shift amount >= precision, we need to avoid undefined behavior.
8684 In the original loop there is no UB, and according to the semantics
8685 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
8686 if (!tree_fits_uhwi_p (step_expr)
8687 || tree_to_uhwi (step_expr) >= prec)
8689 if (induction_type == vect_step_op_shl
8690 || TYPE_UNSIGNED (type))
8691 init_expr = build_zero_cst (type);
8692 else
8693 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8694 init_expr,
8695 wide_int_to_tree (type, prec - 1));
8697 else
8698 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8699 ? RSHIFT_EXPR : LSHIFT_EXPR),
8700 type, init_expr, step_expr);
8701 break;
8703 case vect_step_op_mul:
8705 tree utype = unsigned_type_for (type);
8706 init_expr = gimple_convert (stmts, utype, init_expr);
8707 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
8708 wide_int begin = wi::to_wide (step_expr);
8709 for (unsigned i = 0; i != skipn - 1; i++)
8710 begin = wi::mul (begin, wi::to_wide (step_expr));
8711 tree mult_expr = wide_int_to_tree (utype, begin);
8712 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
8713 init_expr = gimple_convert (stmts, type, init_expr);
8715 break;
8717 default:
8718 gcc_unreachable ();
8721 return init_expr;
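/* Illustrative examples of the peeling above (arbitrary values,
   SKIP_NITERS == 2):
   vect_step_op_neg: init_expr is unchanged (an even number of negations
     cancels out).
   vect_step_op_shr: init_expr >>= 2 * S, or the saturated value when the
     accumulated shift amount reaches the precision.
   vect_step_op_mul: with init_expr == 2 and S == 3, init_expr becomes
     2 * pow (3, 2) == 18. */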
8724 /* Create vector step for vectorized iv. */
8725 static tree
8726 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8727 poly_uint64 vf,
8728 enum vect_induction_op_type induction_type)
8730 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
8731 tree new_name = NULL;
8732 /* Step should be pow (step, vf) for mult induction. */
8733 if (induction_type == vect_step_op_mul)
8735 gcc_assert (vf.is_constant ());
8736 wide_int begin = wi::to_wide (step_expr);
8738 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
8739 begin = wi::mul (begin, wi::to_wide (step_expr));
8741 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
8743 else if (induction_type == vect_step_op_neg)
8744 /* Do nothing. */
8746 else
8747 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
8748 expr, step_expr);
8749 return new_name;
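/* Illustrative examples of the step above (arbitrary values, VF == 4):
   vect_step_op_shr/shl with S == 1: the per-copy step is 4 * 1 == 4.
   vect_step_op_mul with S == 3: the per-copy step is pow (3, 4) == 81.
   vect_step_op_neg: no step is needed, negating an even number of times
     is a no-op. */

/* Broadcast NEW_NAME into a vector of type VECTYPE and materialise it in
   the loop preheader; this is the per-copy step used by the iv update. */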
8752 static tree
8753 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
8754 stmt_vec_info stmt_info,
8755 tree new_name, tree vectype,
8756 enum vect_induction_op_type induction_type)
8758 /* No step is needed for neg induction. */
8759 if (induction_type == vect_step_op_neg)
8760 return NULL;
8762 tree t = unshare_expr (new_name);
8763 gcc_assert (CONSTANT_CLASS_P (new_name)
8764 || TREE_CODE (new_name) == SSA_NAME);
8765 tree new_vec = build_vector_from_val (vectype, t);
8766 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
8767 new_vec, vectype, NULL);
8768 return vec_step;
8771 /* Update the vectorized iv INDUC_DEF with VEC_STEP for INDUCTION_TYPE. */
8772 static tree
8773 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
8774 tree induc_def, tree vec_step,
8775 enum vect_induction_op_type induction_type)
8777 tree vec_def = induc_def;
8778 switch (induction_type)
8780 case vect_step_op_mul:
8782 /* Use unsigned mult to avoid undefined behavior on signed overflow. */
8783 tree uvectype
8784 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
8785 TYPE_VECTOR_SUBPARTS (vectype));
8786 vec_def = gimple_convert (stmts, uvectype, vec_def);
8787 vec_step = gimple_convert (stmts, uvectype, vec_step);
8788 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
8789 vec_def, vec_step);
8790 vec_def = gimple_convert (stmts, vectype, vec_def);
8792 break;
8794 case vect_step_op_shr:
8795 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
8796 vec_def, vec_step);
8797 break;
8799 case vect_step_op_shl:
8800 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
8801 vec_def, vec_step);
8802 break;
8803 case vect_step_op_neg:
8804 vec_def = induc_def;
8805 /* Do nothing. */
8806 break;
8807 default:
8808 gcc_unreachable ();
8811 return vec_def;
8815 /* Function vectorizable_nonlinear_induction
8817 Check if STMT_INFO performs a nonlinear induction computation that can be
8818 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8819 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8820 basic block.
8821 Return true if STMT_INFO is vectorizable in this way. */
8823 static bool
8824 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
8825 stmt_vec_info stmt_info,
8826 gimple **vec_stmt, slp_tree slp_node,
8827 stmt_vector_for_cost *cost_vec)
8829 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8830 unsigned ncopies;
8831 bool nested_in_vect_loop = false;
8832 class loop *iv_loop;
8833 tree vec_def;
8834 edge pe = loop_preheader_edge (loop);
8835 basic_block new_bb;
8836 tree vec_init, vec_step;
8837 tree new_name;
8838 gimple *new_stmt;
8839 gphi *induction_phi;
8840 tree induc_def, vec_dest;
8841 tree init_expr, step_expr;
8842 tree niters_skip;
8843 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8844 unsigned i;
8845 gimple_stmt_iterator si;
8847 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8849 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8850 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8851 enum vect_induction_op_type induction_type
8852 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8854 gcc_assert (induction_type > vect_step_op_add);
8856 if (slp_node)
8857 ncopies = 1;
8858 else
8859 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8860 gcc_assert (ncopies >= 1);
8862 /* FORNOW. Only handle nonlinear induction in the same loop. */
8863 if (nested_in_vect_loop_p (loop, stmt_info))
8865 if (dump_enabled_p ())
8866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8867 "nonlinear induction in nested loop.\n");
8868 return false;
8871 iv_loop = loop;
8872 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8874 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
8875 update for each iv and a permutation to generate the wanted vector iv. */
8876 if (slp_node)
8878 if (dump_enabled_p ())
8879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8880 "SLP induction not supported for nonlinear"
8881 " induction.\n");
8882 return false;
8885 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
8887 if (dump_enabled_p ())
8888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8889 "floating point nonlinear induction vectorization"
8890 " not supported.\n");
8891 return false;
8894 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8895 init_expr = vect_phi_initial_value (phi);
8896 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
8897 && TREE_CODE (step_expr) == INTEGER_CST);
8898 /* step_expr should have the same type as init_expr; e.g. for uint64 a >> 1
8899 the step is int, but a vector<uint64> shift is used. */
8900 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
8902 if (TREE_CODE (init_expr) == INTEGER_CST)
8903 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
8904 else
8905 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
8906 TREE_TYPE (init_expr)));
8908 switch (induction_type)
8910 case vect_step_op_neg:
8911 if (TREE_CODE (init_expr) != INTEGER_CST
8912 && TREE_CODE (init_expr) != REAL_CST)
8914 /* Check for backend support of NEGATE_EXPR and vec_perm. */
8915 if (!directly_supported_p (NEGATE_EXPR, vectype))
8916 return false;
8918 /* The encoding has 2 interleaved stepped patterns. */
8919 vec_perm_builder sel (nunits, 2, 3);
8920 machine_mode mode = TYPE_MODE (vectype);
8921 sel.quick_grow (6);
8922 for (i = 0; i < 3; i++)
8924 sel[i * 2] = i;
8925 sel[i * 2 + 1] = i + nunits;
8927 vec_perm_indices indices (sel, 2, nunits);
8928 if (!can_vec_perm_const_p (mode, mode, indices))
8929 return false;
8931 break;
8933 case vect_step_op_mul:
8935 /* Check for backend support of MULT_EXPR. */
8936 if (!directly_supported_p (MULT_EXPR, vectype))
8937 return false;
8939 /* ??? How to construct the vector step for a variable-length vector:
8940 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
8941 if (!vf.is_constant ())
8942 return false;
8944 break;
8946 case vect_step_op_shr:
8947 /* Check for backend support of RSHIFT_EXPR. */
8948 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
8949 return false;
8951 /* Don't shift more than the type precision to avoid undefined behavior. */
8952 if (!tree_fits_uhwi_p (step_expr)
8953 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8954 TYPE_PRECISION (TREE_TYPE (init_expr))))
8955 return false;
8956 break;
8958 case vect_step_op_shl:
8959 /* Check for backend support of LSHIFT_EXPR. */
8960 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
8961 return false;
8963 /* Don't shift more than the type precision to avoid undefined behavior. */
8964 if (!tree_fits_uhwi_p (step_expr)
8965 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8966 TYPE_PRECISION (TREE_TYPE (init_expr))))
8967 return false;
8969 break;
8971 default:
8972 gcc_unreachable ();
8975 if (!vec_stmt) /* transformation not required. */
8977 unsigned inside_cost = 0, prologue_cost = 0;
8978 /* loop cost for vec_loop. Neg induction doesn't have any
8979 inside_cost. */
8980 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8981 stmt_info, 0, vect_body);
8983 /* Neg induction does not have any inside_cost, so reset the
8984 inside_cost computed above. */
8985 if (induction_type == vect_step_op_neg)
8986 inside_cost = 0;
8988 /* prologue cost for vec_init and vec_step. */
8989 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8990 stmt_info, 0, vect_prologue);
8992 if (dump_enabled_p ())
8993 dump_printf_loc (MSG_NOTE, vect_location,
8994 "vect_model_induction_cost: inside_cost = %d, "
8995 "prologue_cost = %d. \n", inside_cost,
8996 prologue_cost);
8998 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8999 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9000 return true;
9003 /* Transform. */
9005 /* Compute a vector variable, initialized with the first VF values of
9006 the induction variable. E.g., for an iv with IV_PHI='X' and
9007 evolution S, for a vector of 4 units, we want to compute:
9008 [X, X + S, X + 2*S, X + 3*S]. */
9010 if (dump_enabled_p ())
9011 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9013 pe = loop_preheader_edge (iv_loop);
9014 /* Find the first insertion point in the BB. */
9015 basic_block bb = gimple_bb (phi);
9016 si = gsi_after_labels (bb);
9018 gimple_seq stmts = NULL;
9020 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9021 /* If we are using the loop mask to "peel" for alignment then we need
9022 to adjust the start value here. */
9023 if (niters_skip != NULL_TREE)
9024 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9025 step_expr, induction_type);
9027 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9028 step_expr, nunits, vectype,
9029 induction_type);
9030 if (stmts)
9032 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9033 gcc_assert (!new_bb);
9036 stmts = NULL;
9037 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9038 vf, induction_type);
9039 if (stmts)
9041 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9042 gcc_assert (!new_bb);
9045 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9046 new_name, vectype,
9047 induction_type);
9048 /* Create the following def-use cycle:
9049 loop prolog:
9050 vec_init = ...
9051 vec_step = ...
9052 loop:
9053 vec_iv = PHI <vec_init, vec_loop>
9055 STMT
9057 vec_loop = vec_iv <op> vec_step; (<op> given by induction_type) */
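/* Illustrative instance of the cycle above (vect_step_op_shr with
   X == 1024, S == 1, VF == nunits == 4; arbitrary values):
   loop prolog:
     vec_init = [1024, 512, 256, 128]
     vec_step = [4, 4, 4, 4]
   loop:
     vec_iv = PHI <vec_init, vec_loop>
     ...
     vec_loop = vec_iv >> vec_step;
   i.e. each lane advances by four scalar iterations per vector
   iteration. */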
9059 /* Create the induction-phi that defines the induction-operand. */
9060 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9061 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9062 induc_def = PHI_RESULT (induction_phi);
9064 /* Create the iv update inside the loop. */
9065 stmts = NULL;
9066 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9067 induc_def, vec_step,
9068 induction_type);
9070 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9071 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9073 /* Set the arguments of the phi node: */
9074 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9075 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9076 UNKNOWN_LOCATION);
9078 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9079 *vec_stmt = induction_phi;
9081 /* In case that vectorization factor (VF) is bigger than the number
9082 of elements that we can fit in a vectype (nunits), we have to generate
9083 more than one vector stmt - i.e - we need to "unroll" the
9084 vector stmt by a factor VF/nunits. For more details see documentation
9085 in vectorizable_operation. */
9087 if (ncopies > 1)
9089 stmts = NULL;
9090 /* FORNOW. This restriction should be relaxed. */
9091 gcc_assert (!nested_in_vect_loop);
9093 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9094 nunits, induction_type);
9096 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9097 new_name, vectype,
9098 induction_type);
9099 vec_def = induc_def;
9100 for (i = 1; i < ncopies; i++)
9102 /* vec_i = vec_prev + vec_step. */
9103 stmts = NULL;
9104 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9105 vec_def, vec_step,
9106 induction_type);
9107 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9108 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9109 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9113 if (dump_enabled_p ())
9114 dump_printf_loc (MSG_NOTE, vect_location,
9115 "transform induction: created def-use cycle: %G%G",
9116 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9118 return true;
9121 /* Function vectorizable_induction
9123 Check if STMT_INFO performs an induction computation that can be vectorized.
9124 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9125 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9126 Return true if STMT_INFO is vectorizable in this way. */
9128 bool
9129 vectorizable_induction (loop_vec_info loop_vinfo,
9130 stmt_vec_info stmt_info,
9131 gimple **vec_stmt, slp_tree slp_node,
9132 stmt_vector_for_cost *cost_vec)
9134 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9135 unsigned ncopies;
9136 bool nested_in_vect_loop = false;
9137 class loop *iv_loop;
9138 tree vec_def;
9139 edge pe = loop_preheader_edge (loop);
9140 basic_block new_bb;
9141 tree new_vec, vec_init, vec_step, t;
9142 tree new_name;
9143 gimple *new_stmt;
9144 gphi *induction_phi;
9145 tree induc_def, vec_dest;
9146 tree init_expr, step_expr;
9147 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9148 unsigned i;
9149 tree expr;
9150 gimple_stmt_iterator si;
9151 enum vect_induction_op_type induction_type
9152 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9154 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9155 if (!phi)
9156 return false;
9158 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9159 return false;
9161 /* Make sure it was recognized as induction computation. */
9162 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9163 return false;
9165 /* Handle nonlinear induction in a separate place. */
9166 if (induction_type != vect_step_op_add)
9167 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9168 vec_stmt, slp_node, cost_vec);
9170 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9171 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9173 if (slp_node)
9174 ncopies = 1;
9175 else
9176 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9177 gcc_assert (ncopies >= 1);
9179 /* FORNOW. These restrictions should be relaxed. */
9180 if (nested_in_vect_loop_p (loop, stmt_info))
9182 imm_use_iterator imm_iter;
9183 use_operand_p use_p;
9184 gimple *exit_phi;
9185 edge latch_e;
9186 tree loop_arg;
9188 if (ncopies > 1)
9190 if (dump_enabled_p ())
9191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9192 "multiple types in nested loop.\n");
9193 return false;
9196 exit_phi = NULL;
9197 latch_e = loop_latch_edge (loop->inner);
9198 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9199 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9201 gimple *use_stmt = USE_STMT (use_p);
9202 if (is_gimple_debug (use_stmt))
9203 continue;
9205 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9207 exit_phi = use_stmt;
9208 break;
9211 if (exit_phi)
9213 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9214 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9215 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9217 if (dump_enabled_p ())
9218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9219 "inner-loop induction only used outside "
9220 "of the outer vectorized loop.\n");
9221 return false;
9225 nested_in_vect_loop = true;
9226 iv_loop = loop->inner;
9228 else
9229 iv_loop = loop;
9230 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9232 if (slp_node && !nunits.is_constant ())
9234 /* The current SLP code creates the step value element-by-element. */
9235 if (dump_enabled_p ())
9236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9237 "SLP induction not supported for variable-length"
9238 " vectors.\n");
9239 return false;
9242 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9244 if (dump_enabled_p ())
9245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9246 "floating point induction vectorization disabled\n");
9247 return false;
9250 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9251 gcc_assert (step_expr != NULL_TREE);
9252 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9254 /* Check for backend support of PLUS/MINUS_EXPR. */
9255 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9256 || !directly_supported_p (MINUS_EXPR, step_vectype))
9257 return false;
9259 if (!vec_stmt) /* transformation not required. */
9261 unsigned inside_cost = 0, prologue_cost = 0;
9262 if (slp_node)
9264 /* We eventually need to set a vector type on invariant
9265 arguments. */
9266 unsigned j;
9267 slp_tree child;
9268 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9269 if (!vect_maybe_update_slp_op_vectype
9270 (child, SLP_TREE_VECTYPE (slp_node)))
9272 if (dump_enabled_p ())
9273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9274 "incompatible vector types for "
9275 "invariants\n");
9276 return false;
9278 /* loop cost for vec_loop. */
9279 inside_cost
9280 = record_stmt_cost (cost_vec,
9281 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9282 vector_stmt, stmt_info, 0, vect_body);
9283 /* prologue cost for vec_init (if not nested) and step. */
9284 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9285 scalar_to_vec,
9286 stmt_info, 0, vect_prologue);
9288 else /* if (!slp_node) */
9290 /* loop cost for vec_loop. */
9291 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9292 stmt_info, 0, vect_body);
9293 /* prologue cost for vec_init and vec_step. */
9294 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9295 stmt_info, 0, vect_prologue);
9297 if (dump_enabled_p ())
9298 dump_printf_loc (MSG_NOTE, vect_location,
9299 "vect_model_induction_cost: inside_cost = %d, "
9300 "prologue_cost = %d .\n", inside_cost,
9301 prologue_cost);
9303 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9304 DUMP_VECT_SCOPE ("vectorizable_induction");
9305 return true;
9308 /* Transform. */
9310 /* Compute a vector variable, initialized with the first VF values of
9311 the induction variable. E.g., for an iv with IV_PHI='X' and
9312 evolution S, for a vector of 4 units, we want to compute:
9313 [X, X + S, X + 2*S, X + 3*S]. */
9315 if (dump_enabled_p ())
9316 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9318 pe = loop_preheader_edge (iv_loop);
9319 /* Find the first insertion point in the BB. */
9320 basic_block bb = gimple_bb (phi);
9321 si = gsi_after_labels (bb);
9323 /* For SLP induction we have to generate several IVs as for example
9324 with group size 3 we need
9325 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9326 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
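/* A concrete (illustrative) instance of the above with group size 3,
   const_nunits == 4 and scalar IVs i0 = 0 step 1, i1 = 10 step 10,
   i2 = 100 step 100:
     [0, 10, 100, 1] [20, 200, 2, 30] [300, 3, 40, 400]
   where on the backedge every lane is advanced by 4 times its own step,
   since four group iterations are covered per vector iteration. */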
9327 if (slp_node)
9329 /* Enforced above. */
9330 unsigned int const_nunits = nunits.to_constant ();
9332 /* The initial values are vectorized, but any lanes > group_size
9333 need adjustment. */
9334 slp_tree init_node
9335 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9337 /* Gather steps. Since we do not vectorize inductions as
9338 cycles we have to reconstruct the step from SCEV data. */
9339 unsigned group_size = SLP_TREE_LANES (slp_node);
9340 tree *steps = XALLOCAVEC (tree, group_size);
9341 tree *inits = XALLOCAVEC (tree, group_size);
9342 stmt_vec_info phi_info;
9343 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9345 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9346 if (!init_node)
9347 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9348 pe->dest_idx);
9351 /* Now generate the IVs. */
9352 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9353 gcc_assert ((const_nunits * nvects) % group_size == 0);
9354 unsigned nivs;
9355 if (nested_in_vect_loop)
9356 nivs = nvects;
9357 else
9359 /* Compute the number of distinct IVs we need. First reduce
9360 group_size if it is a multiple of const_nunits so we get
9361 one IV for a group_size of 4 but const_nunits 2. */
9362 unsigned group_sizep = group_size;
9363 if (group_sizep % const_nunits == 0)
9364 group_sizep = group_sizep / const_nunits;
9365 nivs = least_common_multiple (group_sizep,
9366 const_nunits) / const_nunits;
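/* For instance (illustrative): group_size == 4 with const_nunits == 2
   gives group_sizep == 2 and nivs == 1, while group_size == 3 with
   const_nunits == 4 keeps group_sizep == 3 and needs
   nivs == least_common_multiple (3, 4) / 4 == 3 IVs. */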
9368 tree stept = TREE_TYPE (step_vectype);
9369 tree lupdate_mul = NULL_TREE;
9370 if (!nested_in_vect_loop)
9372 /* The number of iterations covered in one vector iteration. */
9373 unsigned lup_mul = (nvects * const_nunits) / group_size;
9374 lupdate_mul
9375 = build_vector_from_val (step_vectype,
9376 SCALAR_FLOAT_TYPE_P (stept)
9377 ? build_real_from_wide (stept, lup_mul,
9378 UNSIGNED)
9379 : build_int_cstu (stept, lup_mul));
9381 tree peel_mul = NULL_TREE;
9382 gimple_seq init_stmts = NULL;
9383 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9385 if (SCALAR_FLOAT_TYPE_P (stept))
9386 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9387 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9388 else
9389 peel_mul = gimple_convert (&init_stmts, stept,
9390 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9391 peel_mul = gimple_build_vector_from_val (&init_stmts,
9392 step_vectype, peel_mul);
9394 unsigned ivn;
9395 auto_vec<tree> vec_steps;
9396 for (ivn = 0; ivn < nivs; ++ivn)
9398 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9399 tree_vector_builder init_elts (vectype, const_nunits, 1);
9400 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9401 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9403 /* The scalar steps of the IVs. */
9404 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9405 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9406 step_elts.quick_push (elt);
9407 if (!init_node)
9409 /* The scalar inits of the IVs if not vectorized. */
9410 elt = inits[(ivn*const_nunits + eltn) % group_size];
9411 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9412 TREE_TYPE (elt)))
9413 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9414 TREE_TYPE (vectype), elt);
9415 init_elts.quick_push (elt);
9417 /* The number of steps to add to the initial values. */
9418 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9419 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9420 ? build_real_from_wide (stept,
9421 mul_elt, UNSIGNED)
9422 : build_int_cstu (stept, mul_elt));
9424 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9425 vec_steps.safe_push (vec_step);
9426 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9427 if (peel_mul)
9428 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9429 step_mul, peel_mul);
9430 if (!init_node)
9431 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9433 /* Create the induction-phi that defines the induction-operand. */
9434 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9435 "vec_iv_");
9436 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9437 induc_def = PHI_RESULT (induction_phi);
9439 /* Create the iv update inside the loop */
9440 tree up = vec_step;
9441 if (lupdate_mul)
9442 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9443 vec_step, lupdate_mul);
9444 gimple_seq stmts = NULL;
9445 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9446 vec_def = gimple_build (&stmts,
9447 PLUS_EXPR, step_vectype, vec_def, up);
9448 vec_def = gimple_convert (&stmts, vectype, vec_def);
9449 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9450 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9451 UNKNOWN_LOCATION);
9453 if (init_node)
9454 vec_init = vect_get_slp_vect_def (init_node, ivn);
9455 if (!nested_in_vect_loop
9456 && !integer_zerop (step_mul))
9458 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9459 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9460 vec_step, step_mul);
9461 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9462 vec_def, up);
9463 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9466 /* Set the arguments of the phi node: */
9467 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9469 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9471 if (!nested_in_vect_loop)
9473 /* Fill up to the number of vectors we need for the whole group. */
9474 nivs = least_common_multiple (group_size,
9475 const_nunits) / const_nunits;
9476 vec_steps.reserve (nivs-ivn);
9477 for (; ivn < nivs; ++ivn)
9479 SLP_TREE_VEC_STMTS (slp_node)
9480 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9481 vec_steps.quick_push (vec_steps[0]);
9485 /* Re-use IVs when we can. We are generating further vector
9486 stmts by adding VF' * stride to the IVs generated above. */
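/* For instance (illustrative): with group_size == const_nunits == 4 and
   two vector copies only one IV PHI is created above (nivs == 1) and the
   second vector is obtained from the first by adding each lane's scalar
   step once (vfp == 1). */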
9487 if (ivn < nvects)
9489 unsigned vfp
9490 = least_common_multiple (group_size, const_nunits) / group_size;
9491 tree lupdate_mul
9492 = build_vector_from_val (step_vectype,
9493 SCALAR_FLOAT_TYPE_P (stept)
9494 ? build_real_from_wide (stept,
9495 vfp, UNSIGNED)
9496 : build_int_cstu (stept, vfp));
9497 for (; ivn < nvects; ++ivn)
9499 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9500 tree def = gimple_get_lhs (iv);
9501 if (ivn < 2*nivs)
9502 vec_steps[ivn - nivs]
9503 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9504 vec_steps[ivn - nivs], lupdate_mul);
9505 gimple_seq stmts = NULL;
9506 def = gimple_convert (&stmts, step_vectype, def);
9507 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9508 def, vec_steps[ivn % nivs]);
9509 def = gimple_convert (&stmts, vectype, def);
9510 if (gimple_code (iv) == GIMPLE_PHI)
9511 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9512 else
9514 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9515 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9517 SLP_TREE_VEC_STMTS (slp_node)
9518 .quick_push (SSA_NAME_DEF_STMT (def));
9522 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9523 gcc_assert (!new_bb);
9525 return true;
9528 init_expr = vect_phi_initial_value (phi);
9530 gimple_seq stmts = NULL;
9531 if (!nested_in_vect_loop)
9533 /* Convert the initial value to the IV update type. */
9534 tree new_type = TREE_TYPE (step_expr);
9535 init_expr = gimple_convert (&stmts, new_type, init_expr);
9537 /* If we are using the loop mask to "peel" for alignment then we need
9538 to adjust the start value here. */
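/* For example (illustrative values): with skip_niters == 2 and
   step_expr == 3 the start value is adjusted to init_expr - 6, so that
   the lane corresponding to the first active iteration still sees the
   original initial value. */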
9539 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9540 if (skip_niters != NULL_TREE)
9542 if (FLOAT_TYPE_P (vectype))
9543 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9544 skip_niters);
9545 else
9546 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9547 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9548 skip_niters, step_expr);
9549 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9550 init_expr, skip_step);
9554 if (stmts)
9556 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9557 gcc_assert (!new_bb);
9560 /* Create the vector that holds the initial_value of the induction. */
9561 if (nested_in_vect_loop)
9563 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9564 been created during vectorization of previous stmts. We obtain it
9565 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9566 auto_vec<tree> vec_inits;
9567 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9568 init_expr, &vec_inits);
9569 vec_init = vec_inits[0];
9570 /* If the initial value is not of proper type, convert it. */
9571 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9573 new_stmt
9574 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9575 vect_simple_var,
9576 "vec_iv_"),
9577 VIEW_CONVERT_EXPR,
9578 build1 (VIEW_CONVERT_EXPR, vectype,
9579 vec_init));
9580 vec_init = gimple_assign_lhs (new_stmt);
9581 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9582 new_stmt);
9583 gcc_assert (!new_bb);
9586 else
9588 /* iv_loop is the loop to be vectorized. Create:
9589 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9590 stmts = NULL;
9591 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9593 unsigned HOST_WIDE_INT const_nunits;
9594 if (nunits.is_constant (&const_nunits))
9596 tree_vector_builder elts (step_vectype, const_nunits, 1);
9597 elts.quick_push (new_name);
9598 for (i = 1; i < const_nunits; i++)
9600 /* Create: new_name_i = new_name + step_expr */
9601 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9602 new_name, step_expr);
9603 elts.quick_push (new_name);
9605 /* Create a vector from [new_name_0, new_name_1, ...,
9606 new_name_nunits-1] */
9607 vec_init = gimple_build_vector (&stmts, &elts);
9609 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9610 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9611 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9612 new_name, step_expr);
9613 else
9615 /* Build:
9616 [base, base, base, ...]
9617 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9618 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9619 gcc_assert (flag_associative_math);
9620 tree index = build_index_vector (step_vectype, 0, 1);
9621 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9622 new_name);
9623 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9624 step_expr);
9625 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9626 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9627 vec_init, step_vec);
9628 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9629 vec_init, base_vec);
9631 vec_init = gimple_convert (&stmts, vectype, vec_init);
9633 if (stmts)
9635 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9636 gcc_assert (!new_bb);
9641 /* Create the vector that holds the step of the induction. */
9642 if (nested_in_vect_loop)
9643 /* iv_loop is nested in the loop to be vectorized. Generate:
9644 vec_step = [S, S, S, S] */
9645 new_name = step_expr;
9646 else
9648 /* iv_loop is the loop to be vectorized. Generate:
9649 vec_step = [VF*S, VF*S, VF*S, VF*S] */
9650 gimple_seq seq = NULL;
9651 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9653 expr = build_int_cst (integer_type_node, vf);
9654 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9656 else
9657 expr = build_int_cst (TREE_TYPE (step_expr), vf);
9658 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9659 expr, step_expr);
9660 if (seq)
9662 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9663 gcc_assert (!new_bb);
9667 t = unshare_expr (new_name);
9668 gcc_assert (CONSTANT_CLASS_P (new_name)
9669 || TREE_CODE (new_name) == SSA_NAME);
9670 new_vec = build_vector_from_val (step_vectype, t);
9671 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9672 new_vec, step_vectype, NULL);
9675 /* Create the following def-use cycle:
9676 loop prolog:
9677 vec_init = ...
9678 vec_step = ...
9679 loop:
9680 vec_iv = PHI <vec_init, vec_loop>
9682 STMT
9684 vec_loop = vec_iv + vec_step; */
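/* Illustrative instance of the cycle above (X == 0, S == 1,
   VF == nunits == 4):
   loop prolog:
     vec_init = [0, 1, 2, 3]
     vec_step = [4, 4, 4, 4]
   loop:
     vec_iv = PHI <vec_init, vec_loop>
     ...
     vec_loop = vec_iv + vec_step;  -> [4, 5, 6, 7], [8, 9, 10, 11], ... */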
9686 /* Create the induction-phi that defines the induction-operand. */
9687 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9688 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9689 induc_def = PHI_RESULT (induction_phi);
9691 /* Create the iv update inside the loop */
9692 stmts = NULL;
9693 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9694 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
9695 vec_def = gimple_convert (&stmts, vectype, vec_def);
9696 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9697 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9699 /* Set the arguments of the phi node: */
9700 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9701 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9702 UNKNOWN_LOCATION);
9704 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9705 *vec_stmt = induction_phi;
9707 /* In case that vectorization factor (VF) is bigger than the number
9708 of elements that we can fit in a vectype (nunits), we have to generate
9709 more than one vector stmt - i.e - we need to "unroll" the
9710 vector stmt by a factor VF/nunits. For more details see documentation
9711 in vectorizable_operation. */
9713 if (ncopies > 1)
9715 gimple_seq seq = NULL;
9716 /* FORNOW. This restriction should be relaxed. */
9717 gcc_assert (!nested_in_vect_loop);
9719 /* Create the vector that holds the step of the induction. */
9720 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9722 expr = build_int_cst (integer_type_node, nunits);
9723 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9725 else
9726 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
9727 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9728 expr, step_expr);
9729 if (seq)
9731 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9732 gcc_assert (!new_bb);
9735 t = unshare_expr (new_name);
9736 gcc_assert (CONSTANT_CLASS_P (new_name)
9737 || TREE_CODE (new_name) == SSA_NAME);
9738 new_vec = build_vector_from_val (step_vectype, t);
9739 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9740 new_vec, step_vectype, NULL);
9742 vec_def = induc_def;
9743 for (i = 1; i < ncopies; i++)
9745 /* vec_i = vec_prev + vec_step */
9746 gimple_seq stmts = NULL;
9747 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
9748 vec_def = gimple_build (&stmts,
9749 PLUS_EXPR, step_vectype, vec_def, vec_step);
9750 vec_def = gimple_convert (&stmts, vectype, vec_def);
9752 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9753 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9754 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9758 if (dump_enabled_p ())
9759 dump_printf_loc (MSG_NOTE, vect_location,
9760 "transform induction: created def-use cycle: %G%G",
9761 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9763 return true;
9766 /* Function vectorizable_live_operation.
9768 STMT_INFO computes a value that is used outside the loop. Check if
9769 it can be supported. */
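/* For illustration (a sketch, not taken from a testcase): in

     int last;
     for (int i = 0; i < n; ++i)
       {
         a[i] = b[i] + 1;
         last = a[i];
       }
     use (last);

   the scalar 'last' is live after the loop; after vectorization its final
   value has to be extracted from the last lane of the last vector of a[i]
   values (or via EXTRACT_LAST when the loop is fully masked). */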
9771 bool
9772 vectorizable_live_operation (vec_info *vinfo,
9773 stmt_vec_info stmt_info,
9774 gimple_stmt_iterator *gsi,
9775 slp_tree slp_node, slp_instance slp_node_instance,
9776 int slp_index, bool vec_stmt_p,
9777 stmt_vector_for_cost *cost_vec)
9779 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9780 imm_use_iterator imm_iter;
9781 tree lhs, lhs_type, bitsize;
9782 tree vectype = (slp_node
9783 ? SLP_TREE_VECTYPE (slp_node)
9784 : STMT_VINFO_VECTYPE (stmt_info));
9785 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9786 int ncopies;
9787 gimple *use_stmt;
9788 auto_vec<tree> vec_oprnds;
9789 int vec_entry = 0;
9790 poly_uint64 vec_index = 0;
9792 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
9794 /* If a stmt of a reduction is live, vectorize it via
9795 vect_create_epilog_for_reduction. vectorizable_reduction assessed
9796 validity so just trigger the transform here. */
9797 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
9799 if (!vec_stmt_p)
9800 return true;
9801 if (slp_node)
9803 /* For reduction chains the meta-info is attached to
9804 the group leader. */
9805 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
9806 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
9807 /* For SLP reductions we vectorize the epilogue for
9808 all involved stmts together. */
9809 else if (slp_index != 0)
9810 return true;
9812 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
9813 gcc_assert (reduc_info->is_reduc_info);
9814 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
9815 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
9816 return true;
9817 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
9818 slp_node_instance);
9819 return true;
9822 /* If STMT is not relevant and it is a simple assignment and its inputs are
9823 invariant then it can remain in place, unvectorized. The original last
9824 scalar value that it computes will be used. */
9825 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9827 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
9828 if (dump_enabled_p ())
9829 dump_printf_loc (MSG_NOTE, vect_location,
9830 "statement is simple and uses invariant. Leaving in "
9831 "place.\n");
9832 return true;
9835 if (slp_node)
9836 ncopies = 1;
9837 else
9838 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9840 if (slp_node)
9842 gcc_assert (slp_index >= 0);
9844 /* Get the last occurrence of the scalar index from the concatenation of
9845 all the slp vectors. Calculate which slp vector it is and the index
9846 within. */
9847 int num_scalar = SLP_TREE_LANES (slp_node);
9848 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9849 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
9851 /* Calculate which vector contains the result, and which lane of
9852 that vector we need. */
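/* For example (illustrative): with 3 scalar lanes, 2 vector stmts and
   nunits == 4, pos == 2 * 4 - 3 + slp_index; for slp_index == 2 that is
   7, i.e. vec_entry == 1 and vec_index == 3, the last lane of the second
   vector. */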
9853 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
9855 if (dump_enabled_p ())
9856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9857 "Cannot determine which vector holds the"
9858 " final result.\n");
9859 return false;
9863 if (!vec_stmt_p)
9865 /* No transformation required. */
9866 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
9868 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
9869 OPTIMIZE_FOR_SPEED))
9871 if (dump_enabled_p ())
9872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9873 "can't operate on partial vectors "
9874 "because the target doesn't support extract "
9875 "last reduction.\n");
9876 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9878 else if (slp_node)
9880 if (dump_enabled_p ())
9881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9882 "can't operate on partial vectors "
9883 "because an SLP statement is live after "
9884 "the loop.\n");
9885 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9887 else if (ncopies > 1)
9889 if (dump_enabled_p ())
9890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9891 "can't operate on partial vectors "
9892 "because ncopies is greater than 1.\n");
9893 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9895 else
9897 gcc_assert (ncopies == 1 && !slp_node);
9898 vect_record_loop_mask (loop_vinfo,
9899 &LOOP_VINFO_MASKS (loop_vinfo),
9900 1, vectype, NULL);
9903 /* ??? Enable for loop costing as well. */
9904 if (!loop_vinfo)
9905 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
9906 0, vect_epilogue);
9907 return true;
9910 /* Use the lhs of the original scalar statement. */
9911 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
9912 if (dump_enabled_p ())
9913 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
9914 "stmt %G", stmt);
9916 lhs = gimple_get_lhs (stmt);
9917 lhs_type = TREE_TYPE (lhs);
9919 bitsize = vector_element_bits_tree (vectype);
9921 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
9922 tree vec_lhs, bitstart;
9923 gimple *vec_stmt;
9924 if (slp_node)
9926 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
9928 /* Get the correct slp vectorized stmt. */
9929 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
9930 vec_lhs = gimple_get_lhs (vec_stmt);
9932 /* Get entry to use. */
9933 bitstart = bitsize_int (vec_index);
9934 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
9936 else
9938 /* For multiple copies, get the last copy. */
9939 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
9940 vec_lhs = gimple_get_lhs (vec_stmt);
9942 /* Get the last lane in the vector. */
9943 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
9946 if (loop_vinfo)
9948 /* Ensure that VEC_LHS satisfies the loop-closed PHI requirement for the
9949 lane extraction stmts by inserting one PHI node for it. It looks like:
9950 loop;
9952 # lhs' = PHI <lhs>
9954 loop;
9956 # vec_lhs' = PHI <vec_lhs>
9957 new_tree = lane_extract <vec_lhs', ...>;
9958 lhs' = new_tree; */
9960 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9961 basic_block exit_bb = single_exit (loop)->dest;
9962 gcc_assert (single_pred_p (exit_bb));
9964 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
9965 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
9966 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
9968 gimple_seq stmts = NULL;
9969 tree new_tree;
9970 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9972 /* Emit:
9974 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
9976 where VEC_LHS is the vectorized live-out result and MASK is
9977 the loop mask for the final iteration. */
9978 gcc_assert (ncopies == 1 && !slp_node);
9979 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
9980 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
9981 1, vectype, 0);
9982 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
9983 mask, vec_lhs_phi);
9985 /* Convert the extracted vector element to the scalar type. */
9986 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
9988 else
9990 tree bftype = TREE_TYPE (vectype);
9991 if (VECTOR_BOOLEAN_TYPE_P (vectype))
9992 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9993 new_tree = build3 (BIT_FIELD_REF, bftype,
9994 vec_lhs_phi, bitsize, bitstart);
9995 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9996 &stmts, true, NULL_TREE);
9999 if (stmts)
10001 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10002 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10004 /* Remove the existing PHI for lhs and replace it with a copy from new_tree. */
10005 tree lhs_phi = NULL_TREE;
10006 gimple_stmt_iterator gsi;
10007 for (gsi = gsi_start_phis (exit_bb);
10008 !gsi_end_p (gsi); gsi_next (&gsi))
10010 gimple *phi = gsi_stmt (gsi);
10011 if ((gimple_phi_arg_def (phi, 0) == lhs))
10013 remove_phi_node (&gsi, false);
10014 lhs_phi = gimple_phi_result (phi);
10015 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10016 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10017 break;
10022 /* Replace uses of lhs with the newly computed result. If the use stmt is a
10023 single-arg PHI, just replace all uses of the PHI result. This is necessary
10024 because the LC SSA PHI defining lhs may come before the newly inserted stmt. */
10025 use_operand_p use_p;
10026 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10027 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10028 && !is_gimple_debug (use_stmt))
10030 if (gimple_code (use_stmt) == GIMPLE_PHI
10031 && gimple_phi_num_args (use_stmt) == 1)
10033 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10035 else
10037 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10038 SET_USE (use_p, new_tree);
10040 update_stmt (use_stmt);
10043 else
10045 /* For basic-block vectorization simply insert the lane-extraction. */
10046 tree bftype = TREE_TYPE (vectype);
10047 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10048 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10049 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10050 vec_lhs, bitsize, bitstart);
10051 gimple_seq stmts = NULL;
10052 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10053 &stmts, true, NULL_TREE);
10054 if (TREE_CODE (new_tree) == SSA_NAME
10055 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10056 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10057 if (is_a <gphi *> (vec_stmt))
10059 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10060 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10062 else
10064 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10065 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10068 /* Replace use of lhs with newly computed result. If the use stmt is a
10069 single arg PHI, just replace all uses of PHI result. It's necessary
10070 because lcssa PHI defining lhs may be before newly inserted stmt. */
10071 use_operand_p use_p;
10072 stmt_vec_info use_stmt_info;
10073 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10074 if (!is_gimple_debug (use_stmt)
10075 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10076 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10078 /* ??? This can happen when the live lane ends up being
10079 used in a vector construction code-generated by an
10080 external SLP node (and code-generation for that already
10081 happened). See gcc.dg/vect/bb-slp-47.c.
10082 Doing this is what would happen if that vector CTOR
10083 were not code-generated yet so it is not too bad.
10084 ??? In fact we'd likely want to avoid this situation
10085 in the first place. */
10086 if (TREE_CODE (new_tree) == SSA_NAME
10087 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10088 && gimple_code (use_stmt) != GIMPLE_PHI
10089 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10090 use_stmt))
10092 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10093 gcc_assert (code == CONSTRUCTOR
10094 || code == VIEW_CONVERT_EXPR
10095 || CONVERT_EXPR_CODE_P (code));
10096 if (dump_enabled_p ())
10097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10098 "Using original scalar computation for "
10099 "live lane because use preceeds vector "
10100 "def\n");
10101 continue;
10103 /* ??? It can also happen that we end up pulling a def into
10104 a loop where replacing out-of-loop uses would require
10105 a new LC SSA PHI node. Retain the original scalar in
10106 those cases as well. PR98064. */
10107 if (TREE_CODE (new_tree) == SSA_NAME
10108 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10109 && (gimple_bb (use_stmt)->loop_father
10110 != gimple_bb (vec_stmt)->loop_father)
10111 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10112 gimple_bb (use_stmt)->loop_father))
10114 if (dump_enabled_p ())
10115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10116 "Using original scalar computation for "
10117 "live lane because there is an out-of-loop "
10118 "definition for it\n");
10119 continue;
10121 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10122 SET_USE (use_p, new_tree);
10123 update_stmt (use_stmt);
10127 return true;
10130 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10132 static void
10133 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10135 ssa_op_iter op_iter;
10136 imm_use_iterator imm_iter;
10137 def_operand_p def_p;
10138 gimple *ustmt;
10140 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10142 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10144 basic_block bb;
10146 if (!is_gimple_debug (ustmt))
10147 continue;
10149 bb = gimple_bb (ustmt);
10151 if (!flow_bb_inside_loop_p (loop, bb))
10153 if (gimple_debug_bind_p (ustmt))
10155 if (dump_enabled_p ())
10156 dump_printf_loc (MSG_NOTE, vect_location,
10157 "killing debug use\n");
10159 gimple_debug_bind_reset_value (ustmt);
10160 update_stmt (ustmt);
10162 else
10163 gcc_unreachable ();
10169 /* Given loop represented by LOOP_VINFO, return true if computation of
10170 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10171 otherwise. */
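/* An illustrative case (not taken from the sources or testsuite): for

     unsigned char i = 0;
     do
       a[i] = 0;
     while (++i != 0);

   the loop runs 256 times, NITERSM1 is 255 in unsigned char, and NITERS
   (= NITERSM1 + 1), computed in the same type, wraps to 0; the function
   below must return false for such loops.  */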
10173 static bool
10174 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10176 /* Constant case. */
10177 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10179 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10180 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10182 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10183 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10184 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10185 return true;
10188 widest_int max;
10189 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10190 /* Check the upper bound of loop niters. */
10191 if (get_max_loop_iterations (loop, &max))
10193 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10194 signop sgn = TYPE_SIGN (type);
10195 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10196 if (max < type_max)
10197 return true;
10199 return false;
10202 /* Return a mask type with half the number of elements as OLD_TYPE,
10203 given that it should have mode NEW_MODE. */
10205 tree
10206 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10208 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10209 return build_truth_vector_type_for_mode (nunits, new_mode);
10212 /* Return a mask type with twice as many elements as OLD_TYPE,
10213 given that it should have mode NEW_MODE. */
10215 tree
10216 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10218 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10219 return build_truth_vector_type_for_mode (nunits, new_mode);
10222 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10223 contain a sequence of NVECTORS masks that each control a vector of type
10224 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10225 these vector masks with the vector version of SCALAR_MASK. */
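/* An illustrative sketch of the bookkeeping (hypothetical numbers): with a
   vectorization factor of 16 and an rgroup that needs NVECTORS == 2 masks
   for VECTYPE V8HI, the function below records

     nscalars_per_iter = (2 * 8) / 16 = 1

   in MASKS[NVECTORS - 1], growing MASKS first if it has fewer than NVECTORS
   entries.  A later call for the same NVECTORS with more scalars per
   iteration simply raises max_nscalars_per_iter (and refreshes the recorded
   mask type).  */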
10227 void
10228 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10229 unsigned int nvectors, tree vectype, tree scalar_mask)
10231 gcc_assert (nvectors != 0);
10232 if (masks->length () < nvectors)
10233 masks->safe_grow_cleared (nvectors, true);
10234 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10235 /* The number of scalars per iteration and the number of vectors are
10236 both compile-time constants. */
10237 unsigned int nscalars_per_iter
10238 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10239 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10241 if (scalar_mask)
10243 scalar_cond_masked_key cond (scalar_mask, nvectors);
10244 loop_vinfo->scalar_cond_masked_set.add (cond);
10247 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
10249 rgm->max_nscalars_per_iter = nscalars_per_iter;
10250 rgm->type = truth_type_for (vectype);
10251 rgm->factor = 1;
10255 /* Given a complete set of masks MASKS, extract mask number INDEX
10256 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10257 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10259 See the comment above vec_loop_masks for more details about the mask
10260 arrangement. */
10262 tree
10263 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10264 unsigned int nvectors, tree vectype, unsigned int index)
10266 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10267 tree mask_type = rgm->type;
10269 /* Populate the rgroup's mask array, if this is the first time we've
10270 used it. */
10271 if (rgm->controls.is_empty ())
10273 rgm->controls.safe_grow_cleared (nvectors, true);
10274 for (unsigned int i = 0; i < nvectors; ++i)
10276 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10277 /* Provide a dummy definition until the real one is available. */
10278 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10279 rgm->controls[i] = mask;
10283 tree mask = rgm->controls[index];
10284 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10285 TYPE_VECTOR_SUBPARTS (vectype)))
10287 /* A loop mask for data type X can be reused for data type Y
10288 if X has N times more elements than Y and if Y's elements
10289 are N times bigger than X's. In this case each sequence
10290 of N elements in the loop mask will be all-zero or all-one.
10291 We can then view-convert the mask so that each sequence of
10292 N elements is replaced by a single element. */
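/* For instance (illustrative only): a mask recorded for a V8HI rgroup can
   control a V4SI rgroup; each adjacent pair of mask elements is known to be
   all-zero or all-one, so view-converting the 8-element mask to the
   4-element truth type of V4SI yields the mask required here.  */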
10293 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10294 TYPE_VECTOR_SUBPARTS (vectype)));
10295 gimple_seq seq = NULL;
10296 mask_type = truth_type_for (vectype);
10297 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10298 if (seq)
10299 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10301 return mask;
10304 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10305 lengths for controlling an operation on VECTYPE. The operation splits
10306 each element of VECTYPE into FACTOR separate subelements, measuring the
10307 length as a number of these subelements. */
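/* A small illustrative example (hypothetical numbers): a V4SI access that
   has to fall back to a byte-granular length would be recorded with
   VECTYPE == V4SI and FACTOR == 4, so the recorded length counts the 16
   byte-sized subelements of each vector rather than its 4 int elements.  */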
10309 void
10310 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10311 unsigned int nvectors, tree vectype, unsigned int factor)
10313 gcc_assert (nvectors != 0);
10314 if (lens->length () < nvectors)
10315 lens->safe_grow_cleared (nvectors, true);
10316 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10318 /* The number of scalars per iteration, the bytes occupied by a scalar and
10319 the number of vectors are all compile-time constants. */
10320 unsigned int nscalars_per_iter
10321 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10322 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10324 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10326 /* For now, we only support cases in which all loads and stores fall back
10327 to VnQI or none do. */
10328 gcc_assert (!rgl->max_nscalars_per_iter
10329 || (rgl->factor == 1 && factor == 1)
10330 || (rgl->max_nscalars_per_iter * rgl->factor
10331 == nscalars_per_iter * factor));
10332 rgl->max_nscalars_per_iter = nscalars_per_iter;
10333 rgl->type = vectype;
10334 rgl->factor = factor;
10338 /* Given a complete set of lengths LENS, extract length number INDEX for an
10339 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
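/* A note on the bias handling below (a sketch; target details vary): when
   the target reports a nonzero partial load/store bias, the control that a
   load or store actually consumes is not the raw length but a separate
   "adjusted" length that conceptually incorporates the bias,

     adjusted_loop_len = loop_len + bias   (bias is typically negative)

   hence the extra bias_adjusted_ctrl SSA name created for index 0 and
   returned in preference to the plain control.  */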
10341 tree
10342 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10343 unsigned int nvectors, unsigned int index)
10345 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10346 bool use_bias_adjusted_len =
10347 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10349 /* Populate the rgroup's len array, if this is the first time we've
10350 used it. */
10351 if (rgl->controls.is_empty ())
10353 rgl->controls.safe_grow_cleared (nvectors, true);
10354 for (unsigned int i = 0; i < nvectors; ++i)
10356 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10357 gcc_assert (len_type != NULL_TREE);
10359 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10361 /* Provide a dummy definition until the real one is available. */
10362 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10363 rgl->controls[i] = len;
10365 if (use_bias_adjusted_len)
10367 gcc_assert (i == 0);
10368 tree adjusted_len =
10369 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10370 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10371 rgl->bias_adjusted_ctrl = adjusted_len;
10376 if (use_bias_adjusted_len)
10377 return rgl->bias_adjusted_ctrl;
10378 else
10379 return rgl->controls[index];
10382 /* Scale profiling counters by estimation for LOOP which is vectorized
10383 by factor VF. */
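/* A worked example with made-up numbers: if the loop header count is 400
   and the preheader count is 4 (roughly 100 iterations per entry) and VF
   is 4, the unrolled estimate is about 24 latch iterations; the header
   count is then rescaled to roughly 4 * (24 + 1) = 100 and the exit edge
   probability becomes 1 / (24 + 1).  */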
10385 static void
10386 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10388 edge preheader = loop_preheader_edge (loop);
10389 /* Reduce loop iterations by the vectorization factor. */
10390 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10391 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10393 if (freq_h.nonzero_p ())
10395 profile_probability p;
10397 /* Avoid dropping loop body profile counter to 0 because of zero count
10398 in loop's preheader. */
10399 if (!(freq_e == profile_count::zero ()))
10400 freq_e = freq_e.force_nonzero ();
10401 p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10402 scale_loop_frequencies (loop, p);
10405 edge exit_e = single_exit (loop);
10406 exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10408 edge exit_l = single_pred_edge (loop->latch);
10409 profile_probability prob = exit_l->probability;
10410 exit_l->probability = exit_e->probability.invert ();
10411 if (prob.initialized_p () && exit_l->probability.initialized_p ())
10412 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
10415 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10416 latch edge values originally defined by it. */
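/* Roughly (an illustrative sketch): for a relevant header PHI

     # x_1 = PHI <x_init (preheader), x_2 (latch)>
     # vect_x_1 = PHI <vect_x_init (preheader), ??? (latch)>

   whose scalar latch value x_2 is defined by DEF_STMT_INFO, this fills in
   the still-missing latch arguments of the vectorized PHIs with the lhs of
   the corresponding vectorized latch definitions (with special handling for
   first-order recurrences, which also feed a VEC_PERM_EXPR).  */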
10418 static void
10419 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10420 stmt_vec_info def_stmt_info)
10422 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10423 if (!def || TREE_CODE (def) != SSA_NAME)
10424 return;
10425 stmt_vec_info phi_info;
10426 imm_use_iterator iter;
10427 use_operand_p use_p;
10428 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10430 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10431 if (!phi)
10432 continue;
10433 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10434 && (phi_info = loop_vinfo->lookup_stmt (phi))
10435 && STMT_VINFO_RELEVANT_P (phi_info)))
10436 continue;
10437 loop_p loop = gimple_bb (phi)->loop_father;
10438 edge e = loop_latch_edge (loop);
10439 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10440 continue;
10442 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10443 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10444 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10446 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10447 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10448 gcc_assert (phi_defs.length () == latch_defs.length ());
10449 for (unsigned i = 0; i < phi_defs.length (); ++i)
10450 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10451 gimple_get_lhs (latch_defs[i]), e,
10452 gimple_phi_arg_location (phi, e->dest_idx));
10454 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10456 /* For first order recurrences we have to update both uses of
10457 the latch definition, the one in the PHI node and the one
10458 in the generated VEC_PERM_EXPR. */
10459 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10460 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10461 gcc_assert (phi_defs.length () == latch_defs.length ());
10462 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10463 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10464 for (unsigned i = 0; i < phi_defs.length (); ++i)
10466 gassign *perm = as_a <gassign *> (phi_defs[i]);
10467 if (i > 0)
10468 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10469 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10470 update_stmt (perm);
10472 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10473 gimple_phi_arg_location (phi, e->dest_idx));
10478 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10479 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10480 stmt_vec_info. */
10482 static bool
10483 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10484 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10486 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10487 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10489 if (dump_enabled_p ())
10490 dump_printf_loc (MSG_NOTE, vect_location,
10491 "------>vectorizing statement: %G", stmt_info->stmt);
10493 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10494 vect_loop_kill_debug_uses (loop, stmt_info);
10496 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10497 && !STMT_VINFO_LIVE_P (stmt_info))
10498 return false;
10500 if (STMT_VINFO_VECTYPE (stmt_info))
10502 poly_uint64 nunits
10503 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10504 if (!STMT_SLP_TYPE (stmt_info)
10505 && maybe_ne (nunits, vf)
10506 && dump_enabled_p ())
10507 /* For SLP, VF is set according to the unrolling factor, not to
10508 the vector size, hence this message is not valid for SLP. */
10509 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10512 /* Pure SLP statements have already been vectorized. We still need
10513 to apply loop vectorization to hybrid SLP statements. */
10514 if (PURE_SLP_STMT (stmt_info))
10515 return false;
10517 if (dump_enabled_p ())
10518 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10520 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10521 *seen_store = stmt_info;
10523 return true;
10526 /* Helper function to pass to simplify_replace_tree to enable replacing trees
10527 in the hash_map with their corresponding values. */
10529 static tree
10530 find_in_mapping (tree t, void *context)
10532 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10534 tree *value = mapping->get (t);
10535 return value ? *value : t;
10538 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10539 original loop that has now been vectorized.
10541 The inits of the data_references need to be advanced with the number of
10542 iterations of the main loop. This has been computed in vect_do_peeling and
10543 is stored in parameter ADVANCE. We first restore the data_references'
10544 initial offset with the values recorded in ORIG_DRS_INIT.
10546 Since the loop_vec_info of this EPILOGUE was constructed for the original
10547 loop, its stmt_vec_infos all point to the original statements. These need
10548 to be updated to point to their corresponding copies as well as the SSA_NAMES
10549 in their PATTERN_DEF_SEQs and RELATED_STMTs.
10551 The data_references' connections also need to be updated. Their
10552 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10553 stmt_vec_infos, their statements need to point to their corresponding copy,
10554 if they are gather loads or scatter stores then their reference needs to be
10555 updated to point to its corresponding copy and finally we set
10556 'base_misaligned' to false as we have already peeled for alignment in the
10557 prologue of the main loop. */
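/* A schematic illustration (SSA names invented): if the main loop contains

     _5 = a[i_3];

   and its epilogue copy contains

     _15 = a[i_13];

   then MAPPING records _5 -> _15 (and likewise for the other defs), and the
   worklist pass below rewrites pattern def sequences and related stmts that
   still refer to the main-loop names.  */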
10559 static void
10560 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10562 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10563 auto_vec<gimple *> stmt_worklist;
10564 hash_map<tree,tree> mapping;
10565 gimple *orig_stmt, *new_stmt;
10566 gimple_stmt_iterator epilogue_gsi;
10567 gphi_iterator epilogue_phi_gsi;
10568 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10569 basic_block *epilogue_bbs = get_loop_body (epilogue);
10570 unsigned i;
10572 free (LOOP_VINFO_BBS (epilogue_vinfo));
10573 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10575 /* Advance data_reference's with the number of iterations of the previous
10576 loop and its prologue. */
10577 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10580 /* The EPILOGUE loop is a copy of the original loop so they share the same
10581 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10582 point to the copied statements. We also create a mapping of all LHS' in
10583 the original loop and all the LHS' in the EPILOGUE and create worklists to
10584 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
10585 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10587 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10588 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10590 new_stmt = epilogue_phi_gsi.phi ();
10592 gcc_assert (gimple_uid (new_stmt) > 0);
10593 stmt_vinfo
10594 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10596 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10597 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10599 mapping.put (gimple_phi_result (orig_stmt),
10600 gimple_phi_result (new_stmt));
10601 /* PHI nodes can not have patterns or related statements. */
10602 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
10603 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
10606 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10607 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10609 new_stmt = gsi_stmt (epilogue_gsi);
10610 if (is_gimple_debug (new_stmt))
10611 continue;
10613 gcc_assert (gimple_uid (new_stmt) > 0);
10614 stmt_vinfo
10615 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10617 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10618 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10620 if (tree old_lhs = gimple_get_lhs (orig_stmt))
10621 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
10623 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
10625 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
10626 for (gimple_stmt_iterator gsi = gsi_start (seq);
10627 !gsi_end_p (gsi); gsi_next (&gsi))
10628 stmt_worklist.safe_push (gsi_stmt (gsi));
10631 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10632 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10634 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10635 stmt_worklist.safe_push (stmt);
10636 /* Set BB such that the assert in
10637 'get_initial_def_for_reduction' is able to determine that
10638 the BB of the related stmt is inside this loop. */
10639 gimple_set_bb (stmt,
10640 gimple_bb (new_stmt));
10641 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10642 gcc_assert (related_vinfo == NULL
10643 || related_vinfo == stmt_vinfo);
10648 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
10649 using the original main loop and thus need to be updated to refer to the
10650 cloned variables used in the epilogue. */
10651 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
10653 gimple *stmt = stmt_worklist[i];
10654 tree *new_op;
10656 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
10658 tree op = gimple_op (stmt, j);
10659 if ((new_op = mapping.get(op)))
10660 gimple_set_op (stmt, j, *new_op);
10661 else
10663 /* PR92429: The last argument of simplify_replace_tree disables
10664 folding when replacing arguments. This is required as
10665 otherwise you might end up with different statements than the
10666 ones analyzed in vect_loop_analyze, leading to different
10667 vectorization. */
10668 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
10669 &find_in_mapping, &mapping, false);
10670 gimple_set_op (stmt, j, op);
10675 struct data_reference *dr;
10676 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10677 FOR_EACH_VEC_ELT (datarefs, i, dr)
10679 orig_stmt = DR_STMT (dr);
10680 gcc_assert (gimple_uid (orig_stmt) > 0);
10681 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10682 /* Data references for gather loads and scatter stores do not use the
10683 updated offset we set using ADVANCE. Instead we have to make sure the
10684 references in the data references point to the corresponding copies of
10685 the originals in the epilogue. */
10686 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
10687 == VMAT_GATHER_SCATTER)
10689 DR_REF (dr)
10690 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
10691 &find_in_mapping, &mapping);
10692 DR_BASE_ADDRESS (dr)
10693 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
10694 &find_in_mapping, &mapping);
10696 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10697 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
10698 /* The vector size of the epilogue is smaller than that of the main loop,
10699 so the alignment requirement is either the same or lower. This means
10700 the dr will by definition be aligned. */
10701 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
10704 epilogue_vinfo->shared->datarefs_copy.release ();
10705 epilogue_vinfo->shared->save_datarefs ();
10708 /* Function vect_transform_loop.
10710 The analysis phase has determined that the loop is vectorizable.
10711 Vectorize the loop - create vectorized stmts to replace the scalar
10712 stmts in the loop, and update the loop exit condition.
10713 Returns scalar epilogue loop if any. */
10715 class loop *
10716 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
10718 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10719 class loop *epilogue = NULL;
10720 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
10721 int nbbs = loop->num_nodes;
10722 int i;
10723 tree niters_vector = NULL_TREE;
10724 tree step_vector = NULL_TREE;
10725 tree niters_vector_mult_vf = NULL_TREE;
10726 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10727 unsigned int lowest_vf = constant_lower_bound (vf);
10728 gimple *stmt;
10729 bool check_profitability = false;
10730 unsigned int th;
10732 DUMP_VECT_SCOPE ("vec_transform_loop");
10734 loop_vinfo->shared->check_datarefs ();
10736 /* Use the more conservative vectorization threshold. If the number
10737 of iterations is constant assume the cost check has been performed
10738 by our caller. If the threshold makes all loops profitable that
10739 run at least the (estimated) vectorization factor number of times
10740 checking is pointless, too. */
10741 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
10742 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
10744 if (dump_enabled_p ())
10745 dump_printf_loc (MSG_NOTE, vect_location,
10746 "Profitability threshold is %d loop iterations.\n",
10747 th);
10748 check_profitability = true;
10751 /* Make sure there exists a single-predecessor exit bb. Do this before
10752 versioning. */
10753 edge e = single_exit (loop);
10754 if (! single_pred_p (e->dest))
10756 split_loop_exit_edge (e, true);
10757 if (dump_enabled_p ())
10758 dump_printf (MSG_NOTE, "split exit edge\n");
10761 /* Version the loop first, if required, so the profitability check
10762 comes first. */
10764 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
10766 class loop *sloop
10767 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
10768 sloop->force_vectorize = false;
10769 check_profitability = false;
10772 /* Make sure there exists a single-predecessor exit bb also on the
10773 scalar loop copy. Do this after versioning but before peeling
10774 so CFG structure is fine for both scalar and if-converted loop
10775 to make slpeel_duplicate_current_defs_from_edges face matched
10776 loop closed PHI nodes on the exit. */
10777 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
10779 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
10780 if (! single_pred_p (e->dest))
10782 split_loop_exit_edge (e, true);
10783 if (dump_enabled_p ())
10784 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
10788 tree niters = vect_build_loop_niters (loop_vinfo);
10789 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
10790 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
10791 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
10792 tree advance;
10793 drs_init_vec orig_drs_init;
10795 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
10796 &step_vector, &niters_vector_mult_vf, th,
10797 check_profitability, niters_no_overflow,
10798 &advance);
10800 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
10801 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
10802 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
10803 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10805 if (niters_vector == NULL_TREE)
10807 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10808 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
10809 && known_eq (lowest_vf, vf))
10811 niters_vector
10812 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
10813 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
10814 step_vector = build_one_cst (TREE_TYPE (niters));
10816 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10817 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
10818 &step_vector, niters_no_overflow);
10819 else
10820 /* vect_do_peeling subtracted the number of peeled prologue
10821 iterations from LOOP_VINFO_NITERS. */
10822 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
10823 &niters_vector, &step_vector,
10824 niters_no_overflow);
10827 /* 1) Make sure the loop header has exactly two entries
10828 2) Make sure we have a preheader basic block. */
10830 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
10832 split_edge (loop_preheader_edge (loop));
10834 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10835 /* This will deal with any possible peeling. */
10836 vect_prepare_for_masked_peels (loop_vinfo);
10838 /* Schedule the SLP instances first, then handle loop vectorization
10839 below. */
10840 if (!loop_vinfo->slp_instances.is_empty ())
10842 DUMP_VECT_SCOPE ("scheduling SLP instances");
10843 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
10846 /* FORNOW: the vectorizer supports only loops whose body consists
10847 of one basic block (header + empty latch). When the vectorizer
10848 supports more involved loop forms, the order in which the BBs are
10849 traversed needs to be reconsidered. */
10851 for (i = 0; i < nbbs; i++)
10853 basic_block bb = bbs[i];
10854 stmt_vec_info stmt_info;
10856 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10857 gsi_next (&si))
10859 gphi *phi = si.phi ();
10860 if (dump_enabled_p ())
10861 dump_printf_loc (MSG_NOTE, vect_location,
10862 "------>vectorizing phi: %G", (gimple *) phi);
10863 stmt_info = loop_vinfo->lookup_stmt (phi);
10864 if (!stmt_info)
10865 continue;
10867 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10868 vect_loop_kill_debug_uses (loop, stmt_info);
10870 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10871 && !STMT_VINFO_LIVE_P (stmt_info))
10872 continue;
10874 if (STMT_VINFO_VECTYPE (stmt_info)
10875 && (maybe_ne
10876 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
10877 && dump_enabled_p ())
10878 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10880 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10881 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10882 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10883 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10884 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
10885 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10886 && ! PURE_SLP_STMT (stmt_info))
10888 if (dump_enabled_p ())
10889 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
10890 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
10894 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10895 gsi_next (&si))
10897 gphi *phi = si.phi ();
10898 stmt_info = loop_vinfo->lookup_stmt (phi);
10899 if (!stmt_info)
10900 continue;
10902 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10903 && !STMT_VINFO_LIVE_P (stmt_info))
10904 continue;
10906 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10907 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10908 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10909 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10910 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
10911 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
10912 && ! PURE_SLP_STMT (stmt_info))
10913 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
10916 for (gimple_stmt_iterator si = gsi_start_bb (bb);
10917 !gsi_end_p (si);)
10919 stmt = gsi_stmt (si);
10920 /* During vectorization remove existing clobber stmts. */
10921 if (gimple_clobber_p (stmt))
10923 unlink_stmt_vdef (stmt);
10924 gsi_remove (&si, true);
10925 release_defs (stmt);
10927 else
10929 /* Ignore vector stmts created in the outer loop. */
10930 stmt_info = loop_vinfo->lookup_stmt (stmt);
10932 /* vector stmts created in the outer-loop during vectorization of
10933 stmts in an inner-loop may not have a stmt_info, and do not
10934 need to be vectorized. */
10935 stmt_vec_info seen_store = NULL;
10936 if (stmt_info)
10938 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
10940 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
10941 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
10942 !gsi_end_p (subsi); gsi_next (&subsi))
10944 stmt_vec_info pat_stmt_info
10945 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
10946 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10947 &si, &seen_store);
10949 stmt_vec_info pat_stmt_info
10950 = STMT_VINFO_RELATED_STMT (stmt_info);
10951 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10952 &si, &seen_store))
10953 maybe_set_vectorized_backedge_value (loop_vinfo,
10954 pat_stmt_info);
10956 else
10958 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
10959 &seen_store))
10960 maybe_set_vectorized_backedge_value (loop_vinfo,
10961 stmt_info);
10964 gsi_next (&si);
10965 if (seen_store)
10967 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
10968 /* Interleaving. The vectorization of the
10969 interleaving chain was completed - free
10970 all the stores in the chain. */
10971 vect_remove_stores (loop_vinfo,
10972 DR_GROUP_FIRST_ELEMENT (seen_store));
10973 else
10974 /* Free the attached stmt_vec_info and remove the stmt. */
10975 loop_vinfo->remove_stmt (stmt_info);
10980 /* Stub out scalar statements that must not survive vectorization.
10981 Doing this here helps with grouped statements, or statements that
10982 are involved in patterns. */
10983 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
10984 !gsi_end_p (gsi); gsi_next (&gsi))
10986 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
10987 if (!call || !gimple_call_internal_p (call))
10988 continue;
10989 internal_fn ifn = gimple_call_internal_fn (call);
10990 if (ifn == IFN_MASK_LOAD)
10992 tree lhs = gimple_get_lhs (call);
10993 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10995 tree zero = build_zero_cst (TREE_TYPE (lhs));
10996 gimple *new_stmt = gimple_build_assign (lhs, zero);
10997 gsi_replace (&gsi, new_stmt, true);
11000 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11002 tree lhs = gimple_get_lhs (call);
11003 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11005 tree else_arg
11006 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11007 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11008 gsi_replace (&gsi, new_stmt, true);
11012 } /* BBs in loop */
11014 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11015 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11016 if (integer_onep (step_vector))
11017 niters_no_overflow = true;
11018 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11019 niters_vector_mult_vf, !niters_no_overflow);
11021 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11022 scale_profile_for_vect_loop (loop, assumed_vf);
11024 /* True if the final iteration might not handle a full vector's
11025 worth of scalar iterations. */
11026 bool final_iter_may_be_partial
11027 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11028 /* The minimum number of iterations performed by the epilogue. This
11029 is 1 when peeling for gaps because we always need a final scalar
11030 iteration. */
11031 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11032 /* +1 to convert latch counts to loop iteration counts,
11033 -min_epilogue_iters to remove iterations that cannot be performed
11034 by the vector code. */
11035 int bias_for_lowest = 1 - min_epilogue_iters;
11036 int bias_for_assumed = bias_for_lowest;
11037 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11038 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11040 /* When the amount of peeling is known at compile time, the first
11041 iteration will have exactly alignment_npeels active elements.
11042 In the worst case it will have at least one. */
11043 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11044 bias_for_lowest += lowest_vf - min_first_active;
11045 bias_for_assumed += assumed_vf - min_first_active;
11047 /* In these calculations the "- 1" converts loop iteration counts
11048 back to latch counts. */
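/* For instance (illustrative numbers): with an upper bound of 17 latch
   iterations, lowest_vf == 4, no partial vectors and no peeling for gaps,
   bias_for_lowest is 1 and the new bound is

     udiv_floor (17 + 1, 4) - 1 = 4 - 1 = 3

   latch iterations of the vector loop.  */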
11049 if (loop->any_upper_bound)
11051 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11052 loop->nb_iterations_upper_bound
11053 = (final_iter_may_be_partial
11054 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11055 lowest_vf) - 1
11056 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11057 lowest_vf) - 1);
11058 if (main_vinfo
11059 /* Both peeling for alignment and peeling for gaps can end up
11060 with the scalar epilogue running for more than VF-1 iterations. */
11061 && !main_vinfo->peeling_for_alignment
11062 && !main_vinfo->peeling_for_gaps)
11064 unsigned int bound;
11065 poly_uint64 main_iters
11066 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11067 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11068 main_iters
11069 = upper_bound (main_iters,
11070 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11071 if (can_div_away_from_zero_p (main_iters,
11072 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11073 &bound))
11074 loop->nb_iterations_upper_bound
11075 = wi::umin ((widest_int) (bound - 1),
11076 loop->nb_iterations_upper_bound);
11079 if (loop->any_likely_upper_bound)
11080 loop->nb_iterations_likely_upper_bound
11081 = (final_iter_may_be_partial
11082 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11083 + bias_for_lowest, lowest_vf) - 1
11084 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11085 + bias_for_lowest, lowest_vf) - 1);
11086 if (loop->any_estimate)
11087 loop->nb_iterations_estimate
11088 = (final_iter_may_be_partial
11089 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11090 assumed_vf) - 1
11091 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11092 assumed_vf) - 1);
11094 if (dump_enabled_p ())
11096 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11098 dump_printf_loc (MSG_NOTE, vect_location,
11099 "LOOP VECTORIZED\n");
11100 if (loop->inner)
11101 dump_printf_loc (MSG_NOTE, vect_location,
11102 "OUTER LOOP VECTORIZED\n");
11103 dump_printf (MSG_NOTE, "\n");
11105 else
11106 dump_printf_loc (MSG_NOTE, vect_location,
11107 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11108 GET_MODE_NAME (loop_vinfo->vector_mode));
11111 /* Loops vectorized with a variable factor won't benefit from
11112 unrolling/peeling. */
11113 if (!vf.is_constant ())
11115 loop->unroll = 1;
11116 if (dump_enabled_p ())
11117 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11118 " variable-length vectorization factor\n");
11120 /* Free SLP instances here because otherwise stmt reference counting
11121 won't work. */
11122 slp_instance instance;
11123 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11124 vect_free_slp_instance (instance);
11125 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11126 /* Clear the safelen field since its value is invalid after vectorization:
11127 the vectorized loop can have loop-carried dependencies. */
11128 loop->safelen = 0;
11130 if (epilogue)
11132 update_epilogue_loop_vinfo (epilogue, advance);
11134 epilogue->simduid = loop->simduid;
11135 epilogue->force_vectorize = loop->force_vectorize;
11136 epilogue->dont_vectorize = false;
11139 return epilogue;
11142 /* The code below is trying to perform simple optimization - revert
11143 if-conversion for masked stores, i.e. if the mask of a store is zero
11144 do not perform it and all stored value producers also if possible.
11145 For example,
11146 for (i=0; i<n; i++)
11147 if (c[i])
11149 p1[i] += 1;
11150 p2[i] = p3[i] +2;
11152 this transformation will produce the following semi-hammock:
11154 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11156 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11157 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11158 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11159 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11160 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11161 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11165 void
11166 optimize_mask_stores (class loop *loop)
11168 basic_block *bbs = get_loop_body (loop);
11169 unsigned nbbs = loop->num_nodes;
11170 unsigned i;
11171 basic_block bb;
11172 class loop *bb_loop;
11173 gimple_stmt_iterator gsi;
11174 gimple *stmt;
11175 auto_vec<gimple *> worklist;
11176 auto_purge_vect_location sentinel;
11178 vect_location = find_loop_location (loop);
11179 /* Pick up all masked stores in loop if any. */
11180 for (i = 0; i < nbbs; i++)
11182 bb = bbs[i];
11183 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11184 gsi_next (&gsi))
11186 stmt = gsi_stmt (gsi);
11187 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11188 worklist.safe_push (stmt);
11192 free (bbs);
11193 if (worklist.is_empty ())
11194 return;
11196 /* Loop has masked stores. */
11197 while (!worklist.is_empty ())
11199 gimple *last, *last_store;
11200 edge e, efalse;
11201 tree mask;
11202 basic_block store_bb, join_bb;
11203 gimple_stmt_iterator gsi_to;
11204 tree vdef, new_vdef;
11205 gphi *phi;
11206 tree vectype;
11207 tree zero;
11209 last = worklist.pop ();
11210 mask = gimple_call_arg (last, 2);
11211 bb = gimple_bb (last);
11212 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
11213 the same loop as if_bb. It could be different from LOOP when a two-
11214 level loop nest is vectorized and the mask_store belongs to the inner
11215 one. */
11216 e = split_block (bb, last);
11217 bb_loop = bb->loop_father;
11218 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11219 join_bb = e->dest;
11220 store_bb = create_empty_bb (bb);
11221 add_bb_to_loop (store_bb, bb_loop);
11222 e->flags = EDGE_TRUE_VALUE;
11223 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11224 /* Put STORE_BB to likely part. */
11225 efalse->probability = profile_probability::unlikely ();
11226 store_bb->count = efalse->count ();
11227 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11228 if (dom_info_available_p (CDI_DOMINATORS))
11229 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11230 if (dump_enabled_p ())
11231 dump_printf_loc (MSG_NOTE, vect_location,
11232 "Create new block %d to sink mask stores.",
11233 store_bb->index);
11234 /* Create vector comparison with boolean result. */
11235 vectype = TREE_TYPE (mask);
11236 zero = build_zero_cst (vectype);
11237 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11238 gsi = gsi_last_bb (bb);
11239 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11240 /* Create new PHI node for vdef of the last masked store:
11241 .MEM_2 = VDEF <.MEM_1>
11242 will be converted to
11243 .MEM.3 = VDEF <.MEM_1>
11244 and new PHI node will be created in join bb
11245 .MEM_2 = PHI <.MEM_1, .MEM_3>
11247 vdef = gimple_vdef (last);
11248 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11249 gimple_set_vdef (last, new_vdef);
11250 phi = create_phi_node (vdef, join_bb);
11251 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11253 /* Put all masked stores with the same mask to STORE_BB if possible. */
11254 while (true)
11256 gimple_stmt_iterator gsi_from;
11257 gimple *stmt1 = NULL;
11259 /* Move masked store to STORE_BB. */
11260 last_store = last;
11261 gsi = gsi_for_stmt (last);
11262 gsi_from = gsi;
11263 /* Shift GSI to the previous stmt for further traversal. */
11264 gsi_prev (&gsi);
11265 gsi_to = gsi_start_bb (store_bb);
11266 gsi_move_before (&gsi_from, &gsi_to);
11267 /* Setup GSI_TO to the non-empty block start. */
11268 gsi_to = gsi_start_bb (store_bb);
11269 if (dump_enabled_p ())
11270 dump_printf_loc (MSG_NOTE, vect_location,
11271 "Move stmt to created bb\n%G", last);
11272 /* Move all stored value producers if possible. */
11273 while (!gsi_end_p (gsi))
11275 tree lhs;
11276 imm_use_iterator imm_iter;
11277 use_operand_p use_p;
11278 bool res;
11280 /* Skip debug statements. */
11281 if (is_gimple_debug (gsi_stmt (gsi)))
11283 gsi_prev (&gsi);
11284 continue;
11286 stmt1 = gsi_stmt (gsi);
11287 /* Do not consider statements writing to memory or having
11288 volatile operand. */
11289 if (gimple_vdef (stmt1)
11290 || gimple_has_volatile_ops (stmt1))
11291 break;
11292 gsi_from = gsi;
11293 gsi_prev (&gsi);
11294 lhs = gimple_get_lhs (stmt1);
11295 if (!lhs)
11296 break;
11298 /* LHS of vectorized stmt must be SSA_NAME. */
11299 if (TREE_CODE (lhs) != SSA_NAME)
11300 break;
11302 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11304 /* Remove dead scalar statement. */
11305 if (has_zero_uses (lhs))
11307 gsi_remove (&gsi_from, true);
11308 continue;
11312 /* Check that LHS does not have uses outside of STORE_BB. */
11313 res = true;
11314 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11316 gimple *use_stmt;
11317 use_stmt = USE_STMT (use_p);
11318 if (is_gimple_debug (use_stmt))
11319 continue;
11320 if (gimple_bb (use_stmt) != store_bb)
11322 res = false;
11323 break;
11326 if (!res)
11327 break;
11329 if (gimple_vuse (stmt1)
11330 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11331 break;
11333 /* Can move STMT1 to STORE_BB. */
11334 if (dump_enabled_p ())
11335 dump_printf_loc (MSG_NOTE, vect_location,
11336 "Move stmt to created bb\n%G", stmt1);
11337 gsi_move_before (&gsi_from, &gsi_to);
11338 /* Shift GSI_TO for further insertion. */
11339 gsi_prev (&gsi_to);
11341 /* Put other masked stores with the same mask to STORE_BB. */
11342 if (worklist.is_empty ()
11343 || gimple_call_arg (worklist.last (), 2) != mask
11344 || worklist.last () != stmt1)
11345 break;
11346 last = worklist.pop ();
11348 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11352 /* Decide whether it is possible to use a zero-based induction variable
11353 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11354 the value that the induction variable must be able to hold in order
11355 to ensure that the rgroups eventually have no active vector elements.
11356 Return -1 otherwise. */
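/* A worked example with made-up numbers: for a loop with at most 1003 latch
   iterations, VF == 4, MAX_VF == 4 and no skipped or peeled iterations, the
   code below computes

     iv_limit = (1003 & -4) + 4 = 1000 + 4 = 1004

   i.e. the IV must be able to reach one full vector iteration past the last
   vector-aligned boundary before all rgroups become inactive.  */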
11358 widest_int
11359 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11361 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11362 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11363 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11365 /* Calculate the value that the induction variable must be able
11366 to hit in order to ensure that we end the loop with an all-false mask.
11367 This involves adding the maximum number of inactive trailing scalar
11368 iterations. */
11369 widest_int iv_limit = -1;
11370 if (max_loop_iterations (loop, &iv_limit))
11372 if (niters_skip)
11374 /* Add the maximum number of skipped iterations to the
11375 maximum iteration count. */
11376 if (TREE_CODE (niters_skip) == INTEGER_CST)
11377 iv_limit += wi::to_widest (niters_skip);
11378 else
11379 iv_limit += max_vf - 1;
11381 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11382 /* Make a conservatively-correct assumption. */
11383 iv_limit += max_vf - 1;
11385 /* IV_LIMIT is the maximum number of latch iterations, which is also
11386 the maximum in-range IV value. Round this value down to the previous
11387 vector alignment boundary and then add an extra full iteration. */
11388 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11389 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11391 return iv_limit;
11394 /* For the given rgroup_controls RGC, check whether an induction variable
11395 would ever hit a value that produces a set of all-false masks or zero
11396 lengths before wrapping around. Return true if it's possible to wrap
11397 around before hitting the desirable value, otherwise return false. */
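/* Continuing the example above (hypothetical numbers): with an IV limit of
   1004, an rgroup where max_nscalars_per_iter * factor == 2 and a 32-bit
   compare type, the IV has to count up to 1004 * 2 = 2008, which needs only
   11 bits, so no wrap-around can occur and the function returns false.  */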
11399 bool
11400 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11402 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11404 if (iv_limit == -1)
11405 return true;
11407 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11408 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11409 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11411 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11412 return true;
11414 return false;