gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "diagnostic-core.h"
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "cfganal.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "tree-ssa-loop-ivopts.h"
  43 #include "tree-ssa-loop-manip.h"
  44 #include "tree-ssa-loop-niter.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57 #include "case-cfn-macros.h"
  58
  59 /* Loop Vectorization Pass.
  60
  61    This pass tries to vectorize loops.
  62
  63    For example, the vectorizer transforms the following simple loop:
  64
  65         short a[N]; short b[N]; short c[N]; int i;
  66
  67         for (i=0; i<N; i++){
  68           a[i] = b[i] + c[i];
  69         }
  70
  71    as if it was manually vectorized by rewriting the source code into:
  72
  73         typedef int __attribute__((mode(V8HI))) v8hi;
  74         short a[N];  short b[N]; short c[N];   int i;
  75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  76         v8hi va, vb, vc;
  77
  78         for (i=0; i<N/8; i++){
  79           vb = pb[i];
  80           vc = pc[i];
  81           va = vb + vc;
  82           pa[i] = va;
  83         }
  84
  85         The main entry to this pass is vectorize_loops(), in which
  86    the vectorizer applies a set of analyses on a given set of loops,
  87    followed by the actual vectorization transformation for the loops that
  88    had successfully passed the analysis phase.
  89         Throughout this pass we make a distinction between two types of
  90    data: scalars (which are represented by SSA_NAMES), and memory references
  91    ("data-refs").  These two types of data require different handling both
  92    during analysis and transformation. The types of data-refs that the
  93    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  94    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  95    accesses are required to have a simple (consecutive) access pattern.
  96
  97    Analysis phase:
  98    ===============
  99         The driver for the analysis phase is vect_analyze_loop().
 100    It applies a set of analyses, some of which rely on the scalar evolution
 101    analyzer (scev) developed by Sebastian Pop.
 102
 103         During the analysis phase the vectorizer records some information
 104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 105    loop, as well as general information about the loop as a whole, which is
 106    recorded in a "loop_vec_info" struct attached to each loop.
 107
 108    Transformation phase:
 109    =====================
 110         The loop transformation phase scans all the stmts in the loop, and
 111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 112    the loop that needs to be vectorized.  It inserts the vector code sequence
 113    just before the scalar stmt S, and records a pointer to the vector code
 114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 115    attached to S).  This pointer will be used for the vectorization of following
 116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 117    otherwise, we rely on dead code elimination for removing it.
 118
 119         For example, say stmt S1 was vectorized into stmt VS1:
 120
 121    VS1: vb = px[i];
 122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 123    S2:  a = b;
 124
 125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 128    resulting sequence would be:
 129
 130    VS1: vb = px[i];
 131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 132    VS2: va = vb;
 133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 134
 135         Operands that are not SSA_NAMEs, are data-refs that appear in
 136    load/store operations (like 'x[i]' in S1), and are handled differently.
 137
 138    Target modeling:
 139    =================
 140         Currently the only target specific information that is used is the
 141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 142    Targets that can support different sizes of vectors, for now will need
 143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 144    flexibility will be added in the future.
 145
 146         Since we only vectorize operations which vector form can be
 147    expressed using existing tree codes, to verify that an operation is
 148    supported, the vectorizer checks the relevant optab at the relevant
 149    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 150    the value found is CODE_FOR_nothing, then there's no target support, and
 151    we can't vectorize the stmt.
 152
 153    For additional information on this project see:
 154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 155 */
 156
 157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 158                                                 unsigned *);
 159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 160                                                bool *, bool *, bool);
 161
 162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 163    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 164    may already be set for general statements (not just data refs).  */
 165
 166 static opt_result
 167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 168                               bool vectype_maybe_set_p,
 169                               poly_uint64 *vf)
 170 {
 171   gimple *stmt = stmt_info->stmt;
 172
 173   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 174        && !STMT_VINFO_LIVE_P (stmt_info))
 175       || gimple_clobber_p (stmt))
 176     {
 177       if (dump_enabled_p ())
 178         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 179       return opt_result::success ();
 180     }
 181
 182   tree stmt_vectype, nunits_vectype;
 183   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 184                                                    &stmt_vectype,
 185                                                    &nunits_vectype);
 186   if (!res)
 187     return res;
 188
 189   if (stmt_vectype)
 190     {
 191       if (STMT_VINFO_VECTYPE (stmt_info))
 192         /* The only case when a vectype had been already set is for stmts
 193            that contain a data ref, or for "pattern-stmts" (stmts generated
 194            by the vectorizer to represent/replace a certain idiom).  */
 195         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 196                      || vectype_maybe_set_p)
 197                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 198       else
 199         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 200     }
 201
 202   if (nunits_vectype)
 203     vect_update_max_nunits (vf, nunits_vectype);
 204
 205   return opt_result::success ();
 206 }
 207
 208 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 209    types of STMT_INFO and all attached pattern statements and update
 210    the vectorization factor VF accordingly.  Return true on success
 211    or false if something prevented vectorization.  */
 212
 213 static opt_result
 214 vect_determine_vf_for_stmt (vec_info *vinfo,
 215                             stmt_vec_info stmt_info, poly_uint64 *vf)
 216 {
 217   if (dump_enabled_p ())
 218     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 219                      stmt_info->stmt);
 220   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 221   if (!res)
 222     return res;
 223
 224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 225       && STMT_VINFO_RELATED_STMT (stmt_info))
 226     {
 227       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 228       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 229
 230       /* If a pattern statement has def stmts, analyze them too.  */
 231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 232            !gsi_end_p (si); gsi_next (&si))
 233         {
 234           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 235           if (dump_enabled_p ())
 236             dump_printf_loc (MSG_NOTE, vect_location,
 237                              "==> examining pattern def stmt: %G",
 238                              def_stmt_info->stmt);
 239           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 240           if (!res)
 241             return res;
 242         }
 243
 244       if (dump_enabled_p ())
 245         dump_printf_loc (MSG_NOTE, vect_location,
 246                          "==> examining pattern statement: %G",
 247                          stmt_info->stmt);
 248       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 249       if (!res)
 250         return res;
 251     }
 252
 253   return opt_result::success ();
 254 }
 255
 256 /* Function vect_determine_vectorization_factor
 257
 258    Determine the vectorization factor (VF).  VF is the number of data elements
 259    that are operated upon in parallel in a single iteration of the vectorized
 260    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 261    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 262    elements can fit in a single vector register.
 263
 264    We currently support vectorization of loops in which all types operated upon
 265    are of the same size.  Therefore this function currently sets VF according to
 266    the size of the types operated upon, and fails if there are multiple sizes
 267    in the loop.
 268
 269    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 270    original loop:
 271         for (i=0; i<N; i++){
 272           a[i] = b[i] + c[i];
 273         }
 274
 275    vectorized loop:
 276         for (i=0; i<N; i+=VF){
 277           a[i:VF] = b[i:VF] + c[i:VF];
 278         }
 279 */
 280
 281 static opt_result
 282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 283 {
 284   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 285   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 286   unsigned nbbs = loop->num_nodes;
 287   poly_uint64 vectorization_factor = 1;
 288   tree scalar_type = NULL_TREE;
 289   gphi *phi;
 290   tree vectype;
 291   stmt_vec_info stmt_info;
 292   unsigned i;
 293
 294   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 295
 296   for (i = 0; i < nbbs; i++)
 297     {
 298       basic_block bb = bbs[i];
 299
 300       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 301            gsi_next (&si))
 302         {
 303           phi = si.phi ();
 304           stmt_info = loop_vinfo->lookup_stmt (phi);
 305           if (dump_enabled_p ())
 306             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 307                              (gimple *) phi);
 308
 309           gcc_assert (stmt_info);
 310
 311           if (STMT_VINFO_RELEVANT_P (stmt_info)
 312               || STMT_VINFO_LIVE_P (stmt_info))
 313             {
 314               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 315               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 316
 317               if (dump_enabled_p ())
 318                 dump_printf_loc (MSG_NOTE, vect_location,
 319                                  "get vectype for scalar type:  %T\n",
 320                                  scalar_type);
 321
 322               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 323               if (!vectype)
 324                 return opt_result::failure_at (phi,
 325                                                "not vectorized: unsupported "
 326                                                "data-type %T\n",
 327                                                scalar_type);
 328               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 329
 330               if (dump_enabled_p ())
 331                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 332                                  vectype);
 333
 334               if (dump_enabled_p ())
 335                 {
 336                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 337                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 338                   dump_printf (MSG_NOTE, "\n");
 339                 }
 340
 341               vect_update_max_nunits (&vectorization_factor, vectype);
 342             }
 343         }
 344
 345       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 346            gsi_next (&si))
 347         {
 348           if (is_gimple_debug (gsi_stmt (si)))
 349             continue;
 350           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 351           opt_result res
 352             = vect_determine_vf_for_stmt (loop_vinfo,
 353                                           stmt_info, &vectorization_factor);
 354           if (!res)
 355             return res;
 356         }
 357     }
 358
 359   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 360   if (dump_enabled_p ())
 361     {
 362       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 363       dump_dec (MSG_NOTE, vectorization_factor);
 364       dump_printf (MSG_NOTE, "\n");
 365     }
 366
 367   if (known_le (vectorization_factor, 1U))
 368     return opt_result::failure_at (vect_location,
 369                                    "not vectorized: unsupported data-type\n");
 370   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 371   return opt_result::success ();
 372 }
 373
 374
 375 /* Function vect_is_simple_iv_evolution.
 376
 377    FORNOW: A simple evolution of an induction variables in the loop is
 378    considered a polynomial evolution.  */
 379
 380 static bool
 381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 382                              tree * step)
 383 {
 384   tree init_expr;
 385   tree step_expr;
 386   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 387   basic_block bb;
 388
 389   /* When there is no evolution in this loop, the evolution function
 390      is not "simple".  */
 391   if (evolution_part == NULL_TREE)
 392     return false;
 393
 394   /* When the evolution is a polynomial of degree >= 2
 395      the evolution function is not "simple".  */
 396   if (tree_is_chrec (evolution_part))
 397     return false;
 398
 399   step_expr = evolution_part;
 400   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 401
 402   if (dump_enabled_p ())
 403     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 404                      step_expr, init_expr);
 405
 406   *init = init_expr;
 407   *step = step_expr;
 408
 409   if (TREE_CODE (step_expr) != INTEGER_CST
 410       && (TREE_CODE (step_expr) != SSA_NAME
 411           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 412               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 413           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 414               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 415                   || !flag_associative_math)))
 416       && (TREE_CODE (step_expr) != REAL_CST
 417           || !flag_associative_math))
 418     {
 419       if (dump_enabled_p ())
 420         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 421                          "step unknown.\n");
 422       return false;
 423     }
 424
 425   return true;
 426 }
 427
 428 /* Function vect_is_nonlinear_iv_evolution
 429
 430    Only support nonlinear induction for integer type
 431    1. neg
 432    2. mul by constant
 433    3. lshift/rshift by constant.
 434
 435    For neg induction, return a fake step as integer -1.  */
 436 static bool
 437 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 438                                 gphi* loop_phi_node, tree *init, tree *step)
 439 {
 440   tree init_expr, ev_expr, result, op1, op2;
 441   gimple* def;
 442
 443   if (gimple_phi_num_args (loop_phi_node) != 2)
 444     return false;
 445
 446   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 447   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 448
 449   /* Support nonlinear induction only for integer type.  */
 450   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 451     return false;
 452
 453   *init = init_expr;
 454   result = PHI_RESULT (loop_phi_node);
 455
 456   if (TREE_CODE (ev_expr) != SSA_NAME
 457       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 458       || !is_gimple_assign (def))
 459     return false;
 460
 461   enum tree_code t_code = gimple_assign_rhs_code (def);
 462   switch (t_code)
 463     {
 464     case NEGATE_EXPR:
 465       if (gimple_assign_rhs1 (def) != result)
 466         return false;
 467       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 468       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 469       break;
 470
 471     case RSHIFT_EXPR:
 472     case LSHIFT_EXPR:
 473     case MULT_EXPR:
 474       op1 = gimple_assign_rhs1 (def);
 475       op2 = gimple_assign_rhs2 (def);
 476       if (TREE_CODE (op2) != INTEGER_CST
 477           || op1 != result)
 478         return false;
 479       *step = op2;
 480       if (t_code == LSHIFT_EXPR)
 481         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 482       else if (t_code == RSHIFT_EXPR)
 483         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 484       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 485       else
 486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 487       break;
 488
 489     default:
 490       return false;
 491     }
 492
 493   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 494   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 495
 496   return true;
 497 }
 498
 499 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 500    what we are assuming is a double reduction.  For example, given
 501    a structure like this:
 502
 503       outer1:
 504         x_1 = PHI <x_4(outer2), ...>;
 505         ...
 506
 507       inner:
 508         x_2 = PHI <x_1(outer1), ...>;
 509         ...
 510         x_3 = ...;
 511         ...
 512
 513       outer2:
 514         x_4 = PHI <x_3(inner)>;
 515         ...
 516
 517    outer loop analysis would treat x_1 as a double reduction phi and
 518    this function would then return true for x_2.  */
 519
 520 static bool
 521 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 522 {
 523   use_operand_p use_p;
 524   ssa_op_iter op_iter;
 525   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 526     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 527       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 528         return true;
 529   return false;
 530 }
 531
 532 /* Function vect_analyze_scalar_cycles_1.
 533
 534    Examine the cross iteration def-use cycles of scalar variables
 535    in LOOP.  LOOP_VINFO represents the loop that is now being
 536    considered for vectorization (can be LOOP, or an outer-loop
 537    enclosing LOOP).  SLP indicates there will be some subsequent
 538    slp analyses or not.  */
 539
 540 static void
 541 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 542                               bool slp)
 543 {
 544   basic_block bb = loop->header;
 545   tree init, step;
 546   auto_vec<stmt_vec_info, 64> worklist;
 547   gphi_iterator gsi;
 548   bool double_reduc, reduc_chain;
 549
 550   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 551
 552   /* First - identify all inductions.  Reduction detection assumes that all the
 553      inductions have been identified, therefore, this order must not be
 554      changed.  */
 555   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 556     {
 557       gphi *phi = gsi.phi ();
 558       tree access_fn = NULL;
 559       tree def = PHI_RESULT (phi);
 560       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 561
 562       if (dump_enabled_p ())
 563         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 564                          (gimple *) phi);
 565
 566       /* Skip virtual phi's.  The data dependences that are associated with
 567          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 568       if (virtual_operand_p (def))
 569         continue;
 570
 571       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 572
 573       /* Analyze the evolution function.  */
 574       access_fn = analyze_scalar_evolution (loop, def);
 575       if (access_fn)
 576         {
 577           STRIP_NOPS (access_fn);
 578           if (dump_enabled_p ())
 579             dump_printf_loc (MSG_NOTE, vect_location,
 580                              "Access function of PHI: %T\n", access_fn);
 581           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 582             = initial_condition_in_loop_num (access_fn, loop->num);
 583           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 584             = evolution_part_in_loop_num (access_fn, loop->num);
 585         }
 586
 587       if ((!access_fn
 588            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 589            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 590                                             &init, &step)
 591            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 592                && TREE_CODE (step) != INTEGER_CST))
 593           /* Only handle nonlinear iv for same loop.  */
 594           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 595               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 596                                                   phi, &init, &step)))
 597         {
 598           worklist.safe_push (stmt_vinfo);
 599           continue;
 600         }
 601
 602       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 603                   != NULL_TREE);
 604       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 605
 606       if (dump_enabled_p ())
 607         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 608       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 609     }
 610
 611
 612   /* Second - identify all reductions and nested cycles.  */
 613   while (worklist.length () > 0)
 614     {
 615       stmt_vec_info stmt_vinfo = worklist.pop ();
 616       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 617       tree def = PHI_RESULT (phi);
 618
 619       if (dump_enabled_p ())
 620         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 621                          (gimple *) phi);
 622
 623       gcc_assert (!virtual_operand_p (def)
 624                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 625
 626       stmt_vec_info reduc_stmt_info
 627         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 628                                     &reduc_chain, slp);
 629       if (reduc_stmt_info)
 630         {
 631           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 632           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 633           if (double_reduc)
 634             {
 635               if (dump_enabled_p ())
 636                 dump_printf_loc (MSG_NOTE, vect_location,
 637                                  "Detected double reduction.\n");
 638
 639               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 640               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 641             }
 642           else
 643             {
 644               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 645                 {
 646                   if (dump_enabled_p ())
 647                     dump_printf_loc (MSG_NOTE, vect_location,
 648                                      "Detected vectorizable nested cycle.\n");
 649
 650                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 651                 }
 652               else
 653                 {
 654                   if (dump_enabled_p ())
 655                     dump_printf_loc (MSG_NOTE, vect_location,
 656                                      "Detected reduction.\n");
 657
 658                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 659                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 660                   /* Store the reduction cycles for possible vectorization in
 661                      loop-aware SLP if it was not detected as reduction
 662                      chain.  */
 663                   if (! reduc_chain)
 664                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 665                       (reduc_stmt_info);
 666                 }
 667             }
 668         }
 669       else
 670         if (dump_enabled_p ())
 671           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 672                            "Unknown def-use cycle pattern.\n");
 673     }
 674 }
 675
 676
 677 /* Function vect_analyze_scalar_cycles.
 678
 679    Examine the cross iteration def-use cycles of scalar variables, by
 680    analyzing the loop-header PHIs of scalar variables.  Classify each
 681    cycle as one of the following: invariant, induction, reduction, unknown.
 682    We do that for the loop represented by LOOP_VINFO, and also to its
 683    inner-loop, if exists.
 684    Examples for scalar cycles:
 685
 686    Example1: reduction:
 687
 688               loop1:
 689               for (i=0; i<N; i++)
 690                  sum += a[i];
 691
 692    Example2: induction:
 693
 694               loop2:
 695               for (i=0; i<N; i++)
 696                  a[i] = i;  */
 697
 698 static void
 699 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 700 {
 701   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 702
 703   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 704
 705   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 706      Reductions in such inner-loop therefore have different properties than
 707      the reductions in the nest that gets vectorized:
 708      1. When vectorized, they are executed in the same order as in the original
 709         scalar loop, so we can't change the order of computation when
 710         vectorizing them.
 711      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 712         current checks are too strict.  */
 713
 714   if (loop->inner)
 715     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 716 }
 717
 718 /* Transfer group and reduction information from STMT_INFO to its
 719    pattern stmt.  */
 720
 721 static void
 722 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 723 {
 724   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 725   stmt_vec_info stmtp;
 726   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 727               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 728   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 729   do
 730     {
 731       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 732       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 733                            == STMT_VINFO_DEF_TYPE (stmt_info));
 734       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 735       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 736       if (stmt_info)
 737         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 738           = STMT_VINFO_RELATED_STMT (stmt_info);
 739     }
 740   while (stmt_info);
 741 }
 742
 743 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 744
 745 static void
 746 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 747 {
 748   stmt_vec_info first;
 749   unsigned i;
 750
 751   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 752     {
 753       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 754       while (next)
 755         {
 756           if ((STMT_VINFO_IN_PATTERN_P (next)
 757                != STMT_VINFO_IN_PATTERN_P (first))
 758               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 759             break;
 760           next = REDUC_GROUP_NEXT_ELEMENT (next);
 761         }
 762       /* If all reduction chain members are well-formed patterns adjust
 763          the group to group the pattern stmts instead.  */
 764       if (! next
 765           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 766         {
 767           if (STMT_VINFO_IN_PATTERN_P (first))
 768             {
 769               vect_fixup_reduc_chain (first);
 770               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 771                 = STMT_VINFO_RELATED_STMT (first);
 772             }
 773         }
 774       /* If not all stmt in the chain are patterns or if we failed
 775          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 776          it as regular reduction instead.  */
 777       else
 778         {
 779           stmt_vec_info vinfo = first;
 780           stmt_vec_info last = NULL;
 781           while (vinfo)
 782             {
 783               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 784               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 785               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 786               last = vinfo;
 787               vinfo = next;
 788             }
 789           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 790             = vect_internal_def;
 791           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 792           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 793           --i;
 794         }
 795     }
 796 }
 797
 798 /* Function vect_get_loop_niters.
 799
 800    Determine how many iterations the loop is executed and place it
 801    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 802    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 803    niter information holds in ASSUMPTIONS.
 804
 805    Return the loop exit condition.  */
 806
 807
 808 static gcond *
 809 vect_get_loop_niters (class loop *loop, tree *assumptions,
 810                       tree *number_of_iterations, tree *number_of_iterationsm1)
 811 {
 812   edge exit = single_exit (loop);
 813   class tree_niter_desc niter_desc;
 814   tree niter_assumptions, niter, may_be_zero;
 815   gcond *cond = get_loop_exit_condition (loop);
 816
 817   *assumptions = boolean_true_node;
 818   *number_of_iterationsm1 = chrec_dont_know;
 819   *number_of_iterations = chrec_dont_know;
 820   DUMP_VECT_SCOPE ("get_loop_niters");
 821
 822   if (!exit)
 823     return cond;
 824
 825   may_be_zero = NULL_TREE;
 826   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 827       || chrec_contains_undetermined (niter_desc.niter))
 828     return cond;
 829
 830   niter_assumptions = niter_desc.assumptions;
 831   may_be_zero = niter_desc.may_be_zero;
 832   niter = niter_desc.niter;
 833
 834   if (may_be_zero && integer_zerop (may_be_zero))
 835     may_be_zero = NULL_TREE;
 836
 837   if (may_be_zero)
 838     {
 839       if (COMPARISON_CLASS_P (may_be_zero))
 840         {
 841           /* Try to combine may_be_zero with assumptions, this can simplify
 842              computation of niter expression.  */
 843           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 844             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 845                                              niter_assumptions,
 846                                              fold_build1 (TRUTH_NOT_EXPR,
 847                                                           boolean_type_node,
 848                                                           may_be_zero));
 849           else
 850             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 851                                  build_int_cst (TREE_TYPE (niter), 0),
 852                                  rewrite_to_non_trapping_overflow (niter));
 853
 854           may_be_zero = NULL_TREE;
 855         }
 856       else if (integer_nonzerop (may_be_zero))
 857         {
 858           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 859           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 860           return cond;
 861         }
 862       else
 863         return cond;
 864     }
 865
 866   *assumptions = niter_assumptions;
 867   *number_of_iterationsm1 = niter;
 868
 869   /* We want the number of loop header executions which is the number
 870      of latch executions plus one.
 871      ???  For UINT_MAX latch executions this number overflows to zero
 872      for loops like do { n++; } while (n != 0);  */
 873   if (niter && !chrec_contains_undetermined (niter))
 874     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 875                           build_int_cst (TREE_TYPE (niter), 1));
 876   *number_of_iterations = niter;
 877
 878   return cond;
 879 }
 880
 881 /* Function bb_in_loop_p
 882
 883    Used as predicate for dfs order traversal of the loop bbs.  */
 884
 885 static bool
 886 bb_in_loop_p (const_basic_block bb, const void *data)
 887 {
 888   const class loop *const loop = (const class loop *)data;
 889   if (flow_bb_inside_loop_p (loop, bb))
 890     return true;
 891   return false;
 892 }
 893
 894
 895 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 896    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 897
 898 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 899   : vec_info (vec_info::loop, shared),
 900     loop (loop_in),
 901     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 902     num_itersm1 (NULL_TREE),
 903     num_iters (NULL_TREE),
 904     num_iters_unchanged (NULL_TREE),
 905     num_iters_assumptions (NULL_TREE),
 906     vector_costs (nullptr),
 907     scalar_costs (nullptr),
 908     th (0),
 909     versioning_threshold (0),
 910     vectorization_factor (0),
 911     main_loop_edge (nullptr),
 912     skip_main_loop_edge (nullptr),
 913     skip_this_loop_edge (nullptr),
 914     reusable_accumulators (),
 915     suggested_unroll_factor (1),
 916     max_vectorization_factor (0),
 917     mask_skip_niters (NULL_TREE),
 918     rgroup_compare_type (NULL_TREE),
 919     simd_if_cond (NULL_TREE),
 920     unaligned_dr (NULL),
 921     peeling_for_alignment (0),
 922     ptr_mask (0),
 923     ivexpr_map (NULL),
 924     scan_map (NULL),
 925     slp_unrolling_factor (1),
 926     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
 927     vectorizable (false),
 928     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 929     using_partial_vectors_p (false),
 930     epil_using_partial_vectors_p (false),
 931     partial_load_store_bias (0),
 932     peeling_for_gaps (false),
 933     peeling_for_niter (false),
 934     no_data_dependencies (false),
 935     has_mask_store (false),
 936     scalar_loop_scaling (profile_probability::uninitialized ()),
 937     scalar_loop (NULL),
 938     orig_loop_info (NULL)
 939 {
 940   /* CHECKME: We want to visit all BBs before their successors (except for
 941      latch blocks, for which this assertion wouldn't hold).  In the simple
 942      case of the loop forms we allow, a dfs order of the BBs would the same
 943      as reversed postorder traversal, so we are safe.  */
 944
 945   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 946                                           bbs, loop->num_nodes, loop);
 947   gcc_assert (nbbs == loop->num_nodes);
 948
 949   for (unsigned int i = 0; i < nbbs; i++)
 950     {
 951       basic_block bb = bbs[i];
 952       gimple_stmt_iterator si;
 953
 954       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 955         {
 956           gimple *phi = gsi_stmt (si);
 957           gimple_set_uid (phi, 0);
 958           add_stmt (phi);
 959         }
 960
 961       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 962         {
 963           gimple *stmt = gsi_stmt (si);
 964           gimple_set_uid (stmt, 0);
 965           if (is_gimple_debug (stmt))
 966             continue;
 967           add_stmt (stmt);
 968           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 969              third argument is the #pragma omp simd if (x) condition, when 0,
 970              loop shouldn't be vectorized, when non-zero constant, it should
 971              be vectorized normally, otherwise versioned with vectorized loop
 972              done if the condition is non-zero at runtime.  */
 973           if (loop_in->simduid
 974               && is_gimple_call (stmt)
 975               && gimple_call_internal_p (stmt)
 976               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
 977               && gimple_call_num_args (stmt) >= 3
 978               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 979               && (loop_in->simduid
 980                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 981             {
 982               tree arg = gimple_call_arg (stmt, 2);
 983               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 984                 simd_if_cond = arg;
 985               else
 986                 gcc_assert (integer_nonzerop (arg));
 987             }
 988         }
 989     }
 990
 991   epilogue_vinfos.create (6);
 992 }
 993
 994 /* Free all levels of rgroup CONTROLS.  */
 995
 996 void
 997 release_vec_loop_controls (vec<rgroup_controls> *controls)
 998 {
 999   rgroup_controls *rgc;
1000   unsigned int i;
1001   FOR_EACH_VEC_ELT (*controls, i, rgc)
1002     rgc->controls.release ();
1003   controls->release ();
1004 }
1005
1006 /* Free all memory used by the _loop_vec_info, as well as all the
1007    stmt_vec_info structs of all the stmts in the loop.  */
1008
1009 _loop_vec_info::~_loop_vec_info ()
1010 {
1011   free (bbs);
1012
1013   release_vec_loop_controls (&masks);
1014   release_vec_loop_controls (&lens);
1015   delete ivexpr_map;
1016   delete scan_map;
1017   epilogue_vinfos.release ();
1018   delete scalar_costs;
1019   delete vector_costs;
1020
1021   /* When we release an epiloge vinfo that we do not intend to use
1022      avoid clearing AUX of the main loop which should continue to
1023      point to the main loop vinfo since otherwise we'll leak that.  */
1024   if (loop->aux == this)
1025     loop->aux = NULL;
1026 }
1027
1028 /* Return an invariant or register for EXPR and emit necessary
1029    computations in the LOOP_VINFO loop preheader.  */
1030
1031 tree
1032 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1033 {
1034   if (is_gimple_reg (expr)
1035       || is_gimple_min_invariant (expr))
1036     return expr;
1037
1038   if (! loop_vinfo->ivexpr_map)
1039     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1040   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1041   if (! cached)
1042     {
1043       gimple_seq stmts = NULL;
1044       cached = force_gimple_operand (unshare_expr (expr),
1045                                      &stmts, true, NULL_TREE);
1046       if (stmts)
1047         {
1048           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1049           gsi_insert_seq_on_edge_immediate (e, stmts);
1050         }
1051     }
1052   return cached;
1053 }
1054
1055 /* Return true if we can use CMP_TYPE as the comparison type to produce
1056    all masks required to mask LOOP_VINFO.  */
1057
1058 static bool
1059 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1060 {
1061   rgroup_controls *rgm;
1062   unsigned int i;
1063   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1064     if (rgm->type != NULL_TREE
1065         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1066                                             cmp_type, rgm->type,
1067                                             OPTIMIZE_FOR_SPEED))
1068       return false;
1069   return true;
1070 }
1071
1072 /* Calculate the maximum number of scalars per iteration for every
1073    rgroup in LOOP_VINFO.  */
1074
1075 static unsigned int
1076 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1077 {
1078   unsigned int res = 1;
1079   unsigned int i;
1080   rgroup_controls *rgm;
1081   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1082     res = MAX (res, rgm->max_nscalars_per_iter);
1083   return res;
1084 }
1085
1086 /* Calculate the minimum precision necessary to represent:
1087
1088       MAX_NITERS * FACTOR
1089
1090    as an unsigned integer, where MAX_NITERS is the maximum number of
1091    loop header iterations for the original scalar form of LOOP_VINFO.  */
1092
1093 static unsigned
1094 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1095 {
1096   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1097
1098   /* Get the maximum number of iterations that is representable
1099      in the counter type.  */
1100   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1101   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1102
1103   /* Get a more refined estimate for the number of iterations.  */
1104   widest_int max_back_edges;
1105   if (max_loop_iterations (loop, &max_back_edges))
1106     max_ni = wi::smin (max_ni, max_back_edges + 1);
1107
1108   /* Work out how many bits we need to represent the limit.  */
1109   return wi::min_precision (max_ni * factor, UNSIGNED);
1110 }
1111
1112 /* True if the loop needs peeling or partial vectors when vectorized.  */
1113
1114 static bool
1115 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1116 {
1117   unsigned HOST_WIDE_INT const_vf;
1118   HOST_WIDE_INT max_niter
1119     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1120
1121   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1122   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1123     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1124                                           (loop_vinfo));
1125
1126   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1127       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1128     {
1129       /* Work out the (constant) number of iterations that need to be
1130          peeled for reasons other than niters.  */
1131       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1132       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1133         peel_niter += 1;
1134       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1135                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1136         return true;
1137     }
1138   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1139       /* ??? When peeling for gaps but not alignment, we could
1140          try to check whether the (variable) niters is known to be
1141          VF * N + 1.  That's something of a niche case though.  */
1142       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1143       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1144       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1145            < (unsigned) exact_log2 (const_vf))
1146           /* In case of versioning, check if the maximum number of
1147              iterations is greater than th.  If they are identical,
1148              the epilogue is unnecessary.  */
1149           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1150               || ((unsigned HOST_WIDE_INT) max_niter
1151                   > (th / const_vf) * const_vf))))
1152     return true;
1153
1154   return false;
1155 }
1156
1157 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1158    whether we can actually generate the masks required.  Return true if so,
1159    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1160
1161 static bool
1162 vect_verify_full_masking (loop_vec_info loop_vinfo)
1163 {
1164   unsigned int min_ni_width;
1165   unsigned int max_nscalars_per_iter
1166     = vect_get_max_nscalars_per_iter (loop_vinfo);
1167
1168   /* Use a normal loop if there are no statements that need masking.
1169      This only happens in rare degenerate cases: it means that the loop
1170      has no loads, no stores, and no live-out values.  */
1171   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1172     return false;
1173
1174   /* Work out how many bits we need to represent the limit.  */
1175   min_ni_width
1176     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1177
1178   /* Find a scalar mode for which WHILE_ULT is supported.  */
1179   opt_scalar_int_mode cmp_mode_iter;
1180   tree cmp_type = NULL_TREE;
1181   tree iv_type = NULL_TREE;
1182   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1183   unsigned int iv_precision = UINT_MAX;
1184
1185   if (iv_limit != -1)
1186     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1187                                       UNSIGNED);
1188
1189   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1190     {
1191       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1192       if (cmp_bits >= min_ni_width
1193           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1194         {
1195           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1196           if (this_type
1197               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1198             {
1199               /* Although we could stop as soon as we find a valid mode,
1200                  there are at least two reasons why that's not always the
1201                  best choice:
1202
1203                  - An IV that's Pmode or wider is more likely to be reusable
1204                    in address calculations than an IV that's narrower than
1205                    Pmode.
1206
1207                  - Doing the comparison in IV_PRECISION or wider allows
1208                    a natural 0-based IV, whereas using a narrower comparison
1209                    type requires mitigations against wrap-around.
1210
1211                  Conversely, if the IV limit is variable, doing the comparison
1212                  in a wider type than the original type can introduce
1213                  unnecessary extensions, so picking the widest valid mode
1214                  is not always a good choice either.
1215
1216                  Here we prefer the first IV type that's Pmode or wider,
1217                  and the first comparison type that's IV_PRECISION or wider.
1218                  (The comparison type must be no wider than the IV type,
1219                  to avoid extensions in the vector loop.)
1220
1221                  ??? We might want to try continuing beyond Pmode for ILP32
1222                  targets if CMP_BITS < IV_PRECISION.  */
1223               iv_type = this_type;
1224               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1225                 cmp_type = this_type;
1226               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1227                 break;
1228             }
1229         }
1230     }
1231
1232   if (!cmp_type)
1233     return false;
1234
1235   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1236   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1237   return true;
1238 }
1239
1240 /* Check whether we can use vector access with length based on precison
1241    comparison.  So far, to keep it simple, we only allow the case that the
1242    precision of the target supported length is larger than the precision
1243    required by loop niters.  */
1244
1245 static bool
1246 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1247 {
1248   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1249     return false;
1250
1251   machine_mode len_load_mode = get_len_load_store_mode
1252     (loop_vinfo->vector_mode, true).require ();
1253   machine_mode len_store_mode = get_len_load_store_mode
1254     (loop_vinfo->vector_mode, false).require ();
1255
1256   signed char partial_load_bias = internal_len_load_store_bias
1257     (IFN_LEN_LOAD, len_load_mode);
1258
1259   signed char partial_store_bias = internal_len_load_store_bias
1260     (IFN_LEN_STORE, len_store_mode);
1261
1262   gcc_assert (partial_load_bias == partial_store_bias);
1263
1264   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1265     return false;
1266
1267   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1268      len_loads with a length of zero.  In order to avoid that we prohibit
1269      more than one loop length here.  */
1270   if (partial_load_bias == -1
1271       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1272     return false;
1273
1274   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1275
1276   unsigned int max_nitems_per_iter = 1;
1277   unsigned int i;
1278   rgroup_controls *rgl;
1279   /* Find the maximum number of items per iteration for every rgroup.  */
1280   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1281     {
1282       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1283       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1284     }
1285
1286   /* Work out how many bits we need to represent the length limit.  */
1287   unsigned int min_ni_prec
1288     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1289
1290   /* Now use the maximum of below precisions for one suitable IV type:
1291      - the IV's natural precision
1292      - the precision needed to hold: the maximum number of scalar
1293        iterations multiplied by the scale factor (min_ni_prec above)
1294      - the Pmode precision
1295
1296      If min_ni_prec is less than the precision of the current niters,
1297      we perfer to still use the niters type.  Prefer to use Pmode and
1298      wider IV to avoid narrow conversions.  */
1299
1300   unsigned int ni_prec
1301     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1302   min_ni_prec = MAX (min_ni_prec, ni_prec);
1303   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1304
1305   tree iv_type = NULL_TREE;
1306   opt_scalar_int_mode tmode_iter;
1307   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1308     {
1309       scalar_mode tmode = tmode_iter.require ();
1310       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1311
1312       /* ??? Do we really want to construct one IV whose precision exceeds
1313          BITS_PER_WORD?  */
1314       if (tbits > BITS_PER_WORD)
1315         break;
1316
1317       /* Find the first available standard integral type.  */
1318       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1319         {
1320           iv_type = build_nonstandard_integer_type (tbits, true);
1321           break;
1322         }
1323     }
1324
1325   if (!iv_type)
1326     {
1327       if (dump_enabled_p ())
1328         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1329                          "can't vectorize with length-based partial vectors"
1330                          " because there is no suitable iv type.\n");
1331       return false;
1332     }
1333
1334   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1335   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1336
1337   return true;
1338 }
1339
1340 /* Calculate the cost of one scalar iteration of the loop.  */
1341 static void
1342 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1343 {
1344   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1345   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1346   int nbbs = loop->num_nodes, factor;
1347   int innerloop_iters, i;
1348
1349   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1350
1351   /* Gather costs for statements in the scalar loop.  */
1352
1353   /* FORNOW.  */
1354   innerloop_iters = 1;
1355   if (loop->inner)
1356     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1357
1358   for (i = 0; i < nbbs; i++)
1359     {
1360       gimple_stmt_iterator si;
1361       basic_block bb = bbs[i];
1362
1363       if (bb->loop_father == loop->inner)
1364         factor = innerloop_iters;
1365       else
1366         factor = 1;
1367
1368       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1369         {
1370           gimple *stmt = gsi_stmt (si);
1371           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1372
1373           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1374             continue;
1375
1376           /* Skip stmts that are not vectorized inside the loop.  */
1377           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1378           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1379               && (!STMT_VINFO_LIVE_P (vstmt_info)
1380                   || !VECTORIZABLE_CYCLE_DEF
1381                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1382             continue;
1383
1384           vect_cost_for_stmt kind;
1385           if (STMT_VINFO_DATA_REF (stmt_info))
1386             {
1387               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1388                kind = scalar_load;
1389              else
1390                kind = scalar_store;
1391             }
1392           else if (vect_nop_conversion_p (stmt_info))
1393             continue;
1394           else
1395             kind = scalar_stmt;
1396
1397           /* We are using vect_prologue here to avoid scaling twice
1398              by the inner loop factor.  */
1399           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1400                             factor, kind, stmt_info, 0, vect_prologue);
1401         }
1402     }
1403
1404   /* Now accumulate cost.  */
1405   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1406   add_stmt_costs (loop_vinfo->scalar_costs,
1407                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1408   loop_vinfo->scalar_costs->finish_cost (nullptr);
1409 }
1410
1411
1412 /* Function vect_analyze_loop_form.
1413
1414    Verify that certain CFG restrictions hold, including:
1415    - the loop has a pre-header
1416    - the loop has a single entry and exit
1417    - the loop exit condition is simple enough
1418    - the number of iterations can be analyzed, i.e, a countable loop.  The
1419      niter could be analyzed under some assumptions.  */
1420
1421 opt_result
1422 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1423 {
1424   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1425
1426   /* Different restrictions apply when we are considering an inner-most loop,
1427      vs. an outer (nested) loop.
1428      (FORNOW. May want to relax some of these restrictions in the future).  */
1429
1430   info->inner_loop_cond = NULL;
1431   if (!loop->inner)
1432     {
1433       /* Inner-most loop.  We currently require that the number of BBs is
1434          exactly 2 (the header and latch).  Vectorizable inner-most loops
1435          look like this:
1436
1437                         (pre-header)
1438                            |
1439                           header <--------+
1440                            | |            |
1441                            | +--> latch --+
1442                            |
1443                         (exit-bb)  */
1444
1445       if (loop->num_nodes != 2)
1446         return opt_result::failure_at (vect_location,
1447                                        "not vectorized:"
1448                                        " control flow in loop.\n");
1449
1450       if (empty_block_p (loop->header))
1451         return opt_result::failure_at (vect_location,
1452                                        "not vectorized: empty loop.\n");
1453     }
1454   else
1455     {
1456       class loop *innerloop = loop->inner;
1457       edge entryedge;
1458
1459       /* Nested loop. We currently require that the loop is doubly-nested,
1460          contains a single inner loop, and the number of BBs is exactly 5.
1461          Vectorizable outer-loops look like this:
1462
1463                         (pre-header)
1464                            |
1465                           header <---+
1466                            |         |
1467                           inner-loop |
1468                            |         |
1469                           tail ------+
1470                            |
1471                         (exit-bb)
1472
1473          The inner-loop has the properties expected of inner-most loops
1474          as described above.  */
1475
1476       if ((loop->inner)->inner || (loop->inner)->next)
1477         return opt_result::failure_at (vect_location,
1478                                        "not vectorized:"
1479                                        " multiple nested loops.\n");
1480
1481       if (loop->num_nodes != 5)
1482         return opt_result::failure_at (vect_location,
1483                                        "not vectorized:"
1484                                        " control flow in loop.\n");
1485
1486       entryedge = loop_preheader_edge (innerloop);
1487       if (entryedge->src != loop->header
1488           || !single_exit (innerloop)
1489           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1490         return opt_result::failure_at (vect_location,
1491                                        "not vectorized:"
1492                                        " unsupported outerloop form.\n");
1493
1494       /* Analyze the inner-loop.  */
1495       vect_loop_form_info inner;
1496       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1497       if (!res)
1498         {
1499           if (dump_enabled_p ())
1500             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1501                              "not vectorized: Bad inner loop.\n");
1502           return res;
1503         }
1504
1505       /* Don't support analyzing niter under assumptions for inner
1506          loop.  */
1507       if (!integer_onep (inner.assumptions))
1508         return opt_result::failure_at (vect_location,
1509                                        "not vectorized: Bad inner loop.\n");
1510
1511       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1512         return opt_result::failure_at (vect_location,
1513                                        "not vectorized: inner-loop count not"
1514                                        " invariant.\n");
1515
1516       if (dump_enabled_p ())
1517         dump_printf_loc (MSG_NOTE, vect_location,
1518                          "Considering outer-loop vectorization.\n");
1519       info->inner_loop_cond = inner.loop_cond;
1520     }
1521
1522   if (!single_exit (loop))
1523     return opt_result::failure_at (vect_location,
1524                                    "not vectorized: multiple exits.\n");
1525   if (EDGE_COUNT (loop->header->preds) != 2)
1526     return opt_result::failure_at (vect_location,
1527                                    "not vectorized:"
1528                                    " too many incoming edges.\n");
1529
1530   /* We assume that the loop exit condition is at the end of the loop. i.e,
1531      that the loop is represented as a do-while (with a proper if-guard
1532      before the loop if needed), where the loop header contains all the
1533      executable statements, and the latch is empty.  */
1534   if (!empty_block_p (loop->latch)
1535       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1536     return opt_result::failure_at (vect_location,
1537                                    "not vectorized: latch block not empty.\n");
1538
1539   /* Make sure the exit is not abnormal.  */
1540   edge e = single_exit (loop);
1541   if (e->flags & EDGE_ABNORMAL)
1542     return opt_result::failure_at (vect_location,
1543                                    "not vectorized:"
1544                                    " abnormal loop exit edge.\n");
1545
1546   info->loop_cond
1547     = vect_get_loop_niters (loop, &info->assumptions,
1548                             &info->number_of_iterations,
1549                             &info->number_of_iterationsm1);
1550   if (!info->loop_cond)
1551     return opt_result::failure_at
1552       (vect_location,
1553        "not vectorized: complicated exit condition.\n");
1554
1555   if (integer_zerop (info->assumptions)
1556       || !info->number_of_iterations
1557       || chrec_contains_undetermined (info->number_of_iterations))
1558     return opt_result::failure_at
1559       (info->loop_cond,
1560        "not vectorized: number of iterations cannot be computed.\n");
1561
1562   if (integer_zerop (info->number_of_iterations))
1563     return opt_result::failure_at
1564       (info->loop_cond,
1565        "not vectorized: number of iterations = 0.\n");
1566
1567   if (!(tree_fits_shwi_p (info->number_of_iterations)
1568         && tree_to_shwi (info->number_of_iterations) > 0))
1569     {
1570       if (dump_enabled_p ())
1571         {
1572           dump_printf_loc (MSG_NOTE, vect_location,
1573                            "Symbolic number of iterations is ");
1574           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1575           dump_printf (MSG_NOTE, "\n");
1576         }
1577     }
1578
1579   return opt_result::success ();
1580 }
1581
1582 /* Create a loop_vec_info for LOOP with SHARED and the
1583    vect_analyze_loop_form result.  */
1584
1585 loop_vec_info
1586 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1587                         const vect_loop_form_info *info,
1588                         loop_vec_info main_loop_info)
1589 {
1590   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1591   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1592   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1593   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1594   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1595   /* Also record the assumptions for versioning.  */
1596   if (!integer_onep (info->assumptions) && !main_loop_info)
1597     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1598
1599   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1600   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1601   if (info->inner_loop_cond)
1602     {
1603       stmt_vec_info inner_loop_cond_info
1604         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1605       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1606       /* If we have an estimate on the number of iterations of the inner
1607          loop use that to limit the scale for costing, otherwise use
1608          --param vect-inner-loop-cost-factor literally.  */
1609       widest_int nit;
1610       if (estimated_stmt_executions (loop->inner, &nit))
1611         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1612           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1613     }
1614
1615   return loop_vinfo;
1616 }
1617
1618
1619
1620 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1621    statements update the vectorization factor.  */
1622
1623 static void
1624 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1625 {
1626   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1627   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1628   int nbbs = loop->num_nodes;
1629   poly_uint64 vectorization_factor;
1630   int i;
1631
1632   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1633
1634   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1635   gcc_assert (known_ne (vectorization_factor, 0U));
1636
1637   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1638      vectorization factor of the loop is the unrolling factor required by
1639      the SLP instances.  If that unrolling factor is 1, we say, that we
1640      perform pure SLP on loop - cross iteration parallelism is not
1641      exploited.  */
1642   bool only_slp_in_loop = true;
1643   for (i = 0; i < nbbs; i++)
1644     {
1645       basic_block bb = bbs[i];
1646       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1647            gsi_next (&si))
1648         {
1649           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1650           if (!stmt_info)
1651             continue;
1652           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1653                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1654               && !PURE_SLP_STMT (stmt_info))
1655             /* STMT needs both SLP and loop-based vectorization.  */
1656             only_slp_in_loop = false;
1657         }
1658       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1659            gsi_next (&si))
1660         {
1661           if (is_gimple_debug (gsi_stmt (si)))
1662             continue;
1663           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1664           stmt_info = vect_stmt_to_vectorize (stmt_info);
1665           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1666                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1667               && !PURE_SLP_STMT (stmt_info))
1668             /* STMT needs both SLP and loop-based vectorization.  */
1669             only_slp_in_loop = false;
1670         }
1671     }
1672
1673   if (only_slp_in_loop)
1674     {
1675       if (dump_enabled_p ())
1676         dump_printf_loc (MSG_NOTE, vect_location,
1677                          "Loop contains only SLP stmts\n");
1678       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1679     }
1680   else
1681     {
1682       if (dump_enabled_p ())
1683         dump_printf_loc (MSG_NOTE, vect_location,
1684                          "Loop contains SLP and non-SLP stmts\n");
1685       /* Both the vectorization factor and unroll factor have the form
1686          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1687          so they must have a common multiple.  */
1688       vectorization_factor
1689         = force_common_multiple (vectorization_factor,
1690                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1691     }
1692
1693   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1694   if (dump_enabled_p ())
1695     {
1696       dump_printf_loc (MSG_NOTE, vect_location,
1697                        "Updating vectorization factor to ");
1698       dump_dec (MSG_NOTE, vectorization_factor);
1699       dump_printf (MSG_NOTE, ".\n");
1700     }
1701 }
1702
1703 /* Return true if STMT_INFO describes a double reduction phi and if
1704    the other phi in the reduction is also relevant for vectorization.
1705    This rejects cases such as:
1706
1707       outer1:
1708         x_1 = PHI <x_3(outer2), ...>;
1709         ...
1710
1711       inner:
1712         x_2 = ...;
1713         ...
1714
1715       outer2:
1716         x_3 = PHI <x_2(inner)>;
1717
1718    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1719
1720 static bool
1721 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1722 {
1723   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1724     return false;
1725
1726   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1727 }
1728
1729 /* Function vect_analyze_loop_operations.
1730
1731    Scan the loop stmts and make sure they are all vectorizable.  */
1732
1733 static opt_result
1734 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1735 {
1736   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1737   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1738   int nbbs = loop->num_nodes;
1739   int i;
1740   stmt_vec_info stmt_info;
1741   bool need_to_vectorize = false;
1742   bool ok;
1743
1744   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1745
1746   auto_vec<stmt_info_for_cost> cost_vec;
1747
1748   for (i = 0; i < nbbs; i++)
1749     {
1750       basic_block bb = bbs[i];
1751
1752       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1753            gsi_next (&si))
1754         {
1755           gphi *phi = si.phi ();
1756           ok = true;
1757
1758           stmt_info = loop_vinfo->lookup_stmt (phi);
1759           if (dump_enabled_p ())
1760             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1761                              (gimple *) phi);
1762           if (virtual_operand_p (gimple_phi_result (phi)))
1763             continue;
1764
1765           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1766              (i.e., a phi in the tail of the outer-loop).  */
1767           if (! is_loop_header_bb_p (bb))
1768             {
1769               /* FORNOW: we currently don't support the case that these phis
1770                  are not used in the outerloop (unless it is double reduction,
1771                  i.e., this phi is vect_reduction_def), cause this case
1772                  requires to actually do something here.  */
1773               if (STMT_VINFO_LIVE_P (stmt_info)
1774                   && !vect_active_double_reduction_p (stmt_info))
1775                 return opt_result::failure_at (phi,
1776                                                "Unsupported loop-closed phi"
1777                                                " in outer-loop.\n");
1778
1779               /* If PHI is used in the outer loop, we check that its operand
1780                  is defined in the inner loop.  */
1781               if (STMT_VINFO_RELEVANT_P (stmt_info))
1782                 {
1783                   tree phi_op;
1784
1785                   if (gimple_phi_num_args (phi) != 1)
1786                     return opt_result::failure_at (phi, "unsupported phi");
1787
1788                   phi_op = PHI_ARG_DEF (phi, 0);
1789                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1790                   if (!op_def_info)
1791                     return opt_result::failure_at (phi, "unsupported phi\n");
1792
1793                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1794                       && (STMT_VINFO_RELEVANT (op_def_info)
1795                           != vect_used_in_outer_by_reduction))
1796                     return opt_result::failure_at (phi, "unsupported phi\n");
1797
1798                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1799                        || (STMT_VINFO_DEF_TYPE (stmt_info)
1800                            == vect_double_reduction_def))
1801                       && !vectorizable_lc_phi (loop_vinfo,
1802                                                stmt_info, NULL, NULL))
1803                     return opt_result::failure_at (phi, "unsupported phi\n");
1804                 }
1805
1806               continue;
1807             }
1808
1809           gcc_assert (stmt_info);
1810
1811           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1812                || STMT_VINFO_LIVE_P (stmt_info))
1813               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1814             /* A scalar-dependence cycle that we don't support.  */
1815             return opt_result::failure_at (phi,
1816                                            "not vectorized:"
1817                                            " scalar dependence cycle.\n");
1818
1819           if (STMT_VINFO_RELEVANT_P (stmt_info))
1820             {
1821               need_to_vectorize = true;
1822               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1823                   && ! PURE_SLP_STMT (stmt_info))
1824                 ok = vectorizable_induction (loop_vinfo,
1825                                              stmt_info, NULL, NULL,
1826                                              &cost_vec);
1827               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1828                         || (STMT_VINFO_DEF_TYPE (stmt_info)
1829                             == vect_double_reduction_def)
1830                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1831                        && ! PURE_SLP_STMT (stmt_info))
1832                 ok = vectorizable_reduction (loop_vinfo,
1833                                              stmt_info, NULL, NULL, &cost_vec);
1834             }
1835
1836           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1837           if (ok
1838               && STMT_VINFO_LIVE_P (stmt_info)
1839               && !PURE_SLP_STMT (stmt_info))
1840             ok = vectorizable_live_operation (loop_vinfo,
1841                                               stmt_info, NULL, NULL, NULL,
1842                                               -1, false, &cost_vec);
1843
1844           if (!ok)
1845             return opt_result::failure_at (phi,
1846                                            "not vectorized: relevant phi not "
1847                                            "supported: %G",
1848                                            static_cast <gimple *> (phi));
1849         }
1850
1851       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1852            gsi_next (&si))
1853         {
1854           gimple *stmt = gsi_stmt (si);
1855           if (!gimple_clobber_p (stmt)
1856               && !is_gimple_debug (stmt))
1857             {
1858               opt_result res
1859                 = vect_analyze_stmt (loop_vinfo,
1860                                      loop_vinfo->lookup_stmt (stmt),
1861                                      &need_to_vectorize,
1862                                      NULL, NULL, &cost_vec);
1863               if (!res)
1864                 return res;
1865             }
1866         }
1867     } /* bbs */
1868
1869   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1870
1871   /* All operations in the loop are either irrelevant (deal with loop
1872      control, or dead), or only used outside the loop and can be moved
1873      out of the loop (e.g. invariants, inductions).  The loop can be
1874      optimized away by scalar optimizations.  We're better off not
1875      touching this loop.  */
1876   if (!need_to_vectorize)
1877     {
1878       if (dump_enabled_p ())
1879         dump_printf_loc (MSG_NOTE, vect_location,
1880                          "All the computation can be taken out of the loop.\n");
1881       return opt_result::failure_at
1882         (vect_location,
1883          "not vectorized: redundant loop. no profit to vectorize.\n");
1884     }
1885
1886   return opt_result::success ();
1887 }
1888
1889 /* Return true if we know that the iteration count is smaller than the
1890    vectorization factor.  Return false if it isn't, or if we can't be sure
1891    either way.  */
1892
1893 static bool
1894 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1895 {
1896   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1897
1898   HOST_WIDE_INT max_niter;
1899   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1900     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1901   else
1902     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1903
1904   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1905     return true;
1906
1907   return false;
1908 }
1909
1910 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1911    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1912    definitely no, or -1 if it's worth retrying.  */
1913
1914 static int
1915 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1916                            unsigned *suggested_unroll_factor)
1917 {
1918   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1919   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1920
1921   /* Only loops that can handle partially-populated vectors can have iteration
1922      counts less than the vectorization factor.  */
1923   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1924     {
1925       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1926         {
1927           if (dump_enabled_p ())
1928             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929                              "not vectorized: iteration count smaller than "
1930                              "vectorization factor.\n");
1931           return 0;
1932         }
1933     }
1934
1935   /* If using the "very cheap" model. reject cases in which we'd keep
1936      a copy of the scalar code (even if we might be able to vectorize it).  */
1937   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1938       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1939           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1940           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1941     {
1942       if (dump_enabled_p ())
1943         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1944                          "some scalar iterations would need to be peeled\n");
1945       return 0;
1946     }
1947
1948   int min_profitable_iters, min_profitable_estimate;
1949   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1950                                       &min_profitable_estimate,
1951                                       suggested_unroll_factor);
1952
1953   if (min_profitable_iters < 0)
1954     {
1955       if (dump_enabled_p ())
1956         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1957                          "not vectorized: vectorization not profitable.\n");
1958       if (dump_enabled_p ())
1959         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1960                          "not vectorized: vector version will never be "
1961                          "profitable.\n");
1962       return -1;
1963     }
1964
1965   int min_scalar_loop_bound = (param_min_vect_loop_bound
1966                                * assumed_vf);
1967
1968   /* Use the cost model only if it is more conservative than user specified
1969      threshold.  */
1970   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1971                                     min_profitable_iters);
1972
1973   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1974
1975   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1976       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1977     {
1978       if (dump_enabled_p ())
1979         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1980                          "not vectorized: vectorization not profitable.\n");
1981       if (dump_enabled_p ())
1982         dump_printf_loc (MSG_NOTE, vect_location,
1983                          "not vectorized: iteration count smaller than user "
1984                          "specified loop bound parameter or minimum profitable "
1985                          "iterations (whichever is more conservative).\n");
1986       return 0;
1987     }
1988
1989   /* The static profitablity threshold min_profitable_estimate includes
1990      the cost of having to check at runtime whether the scalar loop
1991      should be used instead.  If it turns out that we don't need or want
1992      such a check, the threshold we should use for the static estimate
1993      is simply the point at which the vector loop becomes more profitable
1994      than the scalar loop.  */
1995   if (min_profitable_estimate > min_profitable_iters
1996       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1997       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1998       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1999       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2000     {
2001       if (dump_enabled_p ())
2002         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2003                          " choice between the scalar and vector loops\n");
2004       min_profitable_estimate = min_profitable_iters;
2005     }
2006
2007   /* If the vector loop needs multiple iterations to be beneficial then
2008      things are probably too close to call, and the conservative thing
2009      would be to stick with the scalar code.  */
2010   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2011       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2012     {
2013       if (dump_enabled_p ())
2014         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015                          "one iteration of the vector loop would be"
2016                          " more expensive than the equivalent number of"
2017                          " iterations of the scalar loop\n");
2018       return 0;
2019     }
2020
2021   HOST_WIDE_INT estimated_niter;
2022
2023   /* If we are vectorizing an epilogue then we know the maximum number of
2024      scalar iterations it will cover is at least one lower than the
2025      vectorization factor of the main loop.  */
2026   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2027     estimated_niter
2028       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2029   else
2030     {
2031       estimated_niter = estimated_stmt_executions_int (loop);
2032       if (estimated_niter == -1)
2033         estimated_niter = likely_max_stmt_executions_int (loop);
2034     }
2035   if (estimated_niter != -1
2036       && ((unsigned HOST_WIDE_INT) estimated_niter
2037           < MAX (th, (unsigned) min_profitable_estimate)))
2038     {
2039       if (dump_enabled_p ())
2040         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2041                          "not vectorized: estimated iteration count too "
2042                          "small.\n");
2043       if (dump_enabled_p ())
2044         dump_printf_loc (MSG_NOTE, vect_location,
2045                          "not vectorized: estimated iteration count smaller "
2046                          "than specified loop bound parameter or minimum "
2047                          "profitable iterations (whichever is more "
2048                          "conservative).\n");
2049       return -1;
2050     }
2051
2052   return 1;
2053 }
2054
2055 static opt_result
2056 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2057                            vec<data_reference_p> *datarefs,
2058                            unsigned int *n_stmts)
2059 {
2060   *n_stmts = 0;
2061   for (unsigned i = 0; i < loop->num_nodes; i++)
2062     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2063          !gsi_end_p (gsi); gsi_next (&gsi))
2064       {
2065         gimple *stmt = gsi_stmt (gsi);
2066         if (is_gimple_debug (stmt))
2067           continue;
2068         ++(*n_stmts);
2069         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2070                                                         NULL, 0);
2071         if (!res)
2072           {
2073             if (is_gimple_call (stmt) && loop->safelen)
2074               {
2075                 tree fndecl = gimple_call_fndecl (stmt), op;
2076                 if (fndecl != NULL_TREE)
2077                   {
2078                     cgraph_node *node = cgraph_node::get (fndecl);
2079                     if (node != NULL && node->simd_clones != NULL)
2080                       {
2081                         unsigned int j, n = gimple_call_num_args (stmt);
2082                         for (j = 0; j < n; j++)
2083                           {
2084                             op = gimple_call_arg (stmt, j);
2085                             if (DECL_P (op)
2086                                 || (REFERENCE_CLASS_P (op)
2087                                     && get_base_address (op)))
2088                               break;
2089                           }
2090                         op = gimple_call_lhs (stmt);
2091                         /* Ignore #pragma omp declare simd functions
2092                            if they don't have data references in the
2093                            call stmt itself.  */
2094                         if (j == n
2095                             && !(op
2096                                  && (DECL_P (op)
2097                                      || (REFERENCE_CLASS_P (op)
2098                                          && get_base_address (op)))))
2099                           continue;
2100                       }
2101                   }
2102               }
2103             return res;
2104           }
2105         /* If dependence analysis will give up due to the limit on the
2106            number of datarefs stop here and fail fatally.  */
2107         if (datarefs->length ()
2108             > (unsigned)param_loop_max_datarefs_for_datadeps)
2109           return opt_result::failure_at (stmt, "exceeded param "
2110                                          "loop-max-datarefs-for-datadeps\n");
2111       }
2112   return opt_result::success ();
2113 }
2114
2115 /* Look for SLP-only access groups and turn each individual access into its own
2116    group.  */
2117 static void
2118 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2119 {
2120   unsigned int i;
2121   struct data_reference *dr;
2122
2123   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2124
2125   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2126   FOR_EACH_VEC_ELT (datarefs, i, dr)
2127     {
2128       gcc_assert (DR_REF (dr));
2129       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2130
2131       /* Check if the load is a part of an interleaving chain.  */
2132       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2133         {
2134           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2135           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2136           unsigned int group_size = DR_GROUP_SIZE (first_element);
2137
2138           /* Check if SLP-only groups.  */
2139           if (!STMT_SLP_TYPE (stmt_info)
2140               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2141             {
2142               /* Dissolve the group.  */
2143               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2144
2145               stmt_vec_info vinfo = first_element;
2146               while (vinfo)
2147                 {
2148                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2149                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2150                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2151                   DR_GROUP_SIZE (vinfo) = 1;
2152                   if (STMT_VINFO_STRIDED_P (first_element))
2153                     DR_GROUP_GAP (vinfo) = 0;
2154                   else
2155                     DR_GROUP_GAP (vinfo) = group_size - 1;
2156                   /* Duplicate and adjust alignment info, it needs to
2157                      be present on each group leader, see dr_misalignment.  */
2158                   if (vinfo != first_element)
2159                     {
2160                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2161                       dr_info2->target_alignment = dr_info->target_alignment;
2162                       int misalignment = dr_info->misalignment;
2163                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2164                         {
2165                           HOST_WIDE_INT diff
2166                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2167                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2168                           unsigned HOST_WIDE_INT align_c
2169                             = dr_info->target_alignment.to_constant ();
2170                           misalignment = (misalignment + diff) % align_c;
2171                         }
2172                       dr_info2->misalignment = misalignment;
2173                     }
2174                   vinfo = next;
2175                 }
2176             }
2177         }
2178     }
2179 }
2180
2181 /* Determine if operating on full vectors for LOOP_VINFO might leave
2182    some scalar iterations still to do.  If so, decide how we should
2183    handle those scalar iterations.  The possibilities are:
2184
2185    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2186        In this case:
2187
2188          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2189          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2190          LOOP_VINFO_PEELING_FOR_NITER == false
2191
2192    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2193        to handle the remaining scalar iterations.  In this case:
2194
2195          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2196          LOOP_VINFO_PEELING_FOR_NITER == true
2197
2198        There are two choices:
2199
2200        (2a) Consider vectorizing the epilogue loop at the same VF as the
2201             main loop, but using partial vectors instead of full vectors.
2202             In this case:
2203
2204               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2205
2206        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2207             In this case:
2208
2209               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2210
2211    When FOR_EPILOGUE_P is true, make this determination based on the
2212    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2213    based on the assumption that LOOP_VINFO is the main loop.  The caller
2214    has made sure that the number of iterations is set appropriately for
2215    this value of FOR_EPILOGUE_P.  */
2216
2217 opt_result
2218 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2219                                             bool for_epilogue_p)
2220 {
2221   /* Determine whether there would be any scalar iterations left over.  */
2222   bool need_peeling_or_partial_vectors_p
2223     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2224
2225   /* Decide whether to vectorize the loop with partial vectors.  */
2226   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2227   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2228   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2229       && need_peeling_or_partial_vectors_p)
2230     {
2231       /* For partial-vector-usage=1, try to push the handling of partial
2232          vectors to the epilogue, with the main loop continuing to operate
2233          on full vectors.
2234
2235          If we are unrolling we also do not want to use partial vectors. This
2236          is to avoid the overhead of generating multiple masks and also to
2237          avoid having to execute entire iterations of FALSE masked instructions
2238          when dealing with one or less full iterations.
2239
2240          ??? We could then end up failing to use partial vectors if we
2241          decide to peel iterations into a prologue, and if the main loop
2242          then ends up processing fewer than VF iterations.  */
2243       if ((param_vect_partial_vector_usage == 1
2244            || loop_vinfo->suggested_unroll_factor > 1)
2245           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2246           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2247         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2248       else
2249         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2250     }
2251
2252   if (dump_enabled_p ())
2253     {
2254       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2255         dump_printf_loc (MSG_NOTE, vect_location,
2256                          "operating on partial vectors%s.\n",
2257                          for_epilogue_p ? " for epilogue loop" : "");
2258       else
2259         dump_printf_loc (MSG_NOTE, vect_location,
2260                          "operating only on full vectors%s.\n",
2261                          for_epilogue_p ? " for epilogue loop" : "");
2262     }
2263
2264   if (for_epilogue_p)
2265     {
2266       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2267       gcc_assert (orig_loop_vinfo);
2268       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2269         gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2270                               LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2271     }
2272
2273   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2274       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2275     {
2276       /* Check that the loop processes at least one full vector.  */
2277       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2278       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2279       if (known_lt (wi::to_widest (scalar_niters), vf))
2280         return opt_result::failure_at (vect_location,
2281                                        "loop does not have enough iterations"
2282                                        " to support vectorization.\n");
2283
2284       /* If we need to peel an extra epilogue iteration to handle data
2285          accesses with gaps, check that there are enough scalar iterations
2286          available.
2287
2288          The check above is redundant with this one when peeling for gaps,
2289          but the distinction is useful for diagnostics.  */
2290       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2291       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2292           && known_lt (wi::to_widest (scalar_nitersm1), vf))
2293         return opt_result::failure_at (vect_location,
2294                                        "loop does not have enough iterations"
2295                                        " to support peeling for gaps.\n");
2296     }
2297
2298   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2299     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2300        && need_peeling_or_partial_vectors_p);
2301
2302   return opt_result::success ();
2303 }
2304
2305 /* Function vect_analyze_loop_2.
2306
2307    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2308    analyses will record information in some members of LOOP_VINFO.  FATAL
2309    indicates if some analysis meets fatal error.  If one non-NULL pointer
2310    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2311    worked out suggested unroll factor, while one NULL pointer shows it's
2312    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2313    is to hold the slp decision when the suggested unroll factor is worked
2314    out.  */
2315 static opt_result
2316 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2317                      unsigned *suggested_unroll_factor,
2318                      bool& slp_done_for_suggested_uf)
2319 {
2320   opt_result ok = opt_result::success ();
2321   int res;
2322   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2323   poly_uint64 min_vf = 2;
2324   loop_vec_info orig_loop_vinfo = NULL;
2325
2326   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2327      loop_vec_info of the first vectorized loop.  */
2328   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2329     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2330   else
2331     orig_loop_vinfo = loop_vinfo;
2332   gcc_assert (orig_loop_vinfo);
2333
2334   /* The first group of checks is independent of the vector size.  */
2335   fatal = true;
2336
2337   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2338       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2339     return opt_result::failure_at (vect_location,
2340                                    "not vectorized: simd if(0)\n");
2341
2342   /* Find all data references in the loop (which correspond to vdefs/vuses)
2343      and analyze their evolution in the loop.  */
2344
2345   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2346
2347   /* Gather the data references and count stmts in the loop.  */
2348   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2349     {
2350       opt_result res
2351         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2352                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2353                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2354       if (!res)
2355         {
2356           if (dump_enabled_p ())
2357             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2358                              "not vectorized: loop contains function "
2359                              "calls or data references that cannot "
2360                              "be analyzed\n");
2361           return res;
2362         }
2363       loop_vinfo->shared->save_datarefs ();
2364     }
2365   else
2366     loop_vinfo->shared->check_datarefs ();
2367
2368   /* Analyze the data references and also adjust the minimal
2369      vectorization factor according to the loads and stores.  */
2370
2371   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2372   if (!ok)
2373     {
2374       if (dump_enabled_p ())
2375         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2376                          "bad data references.\n");
2377       return ok;
2378     }
2379
2380   /* Check if we are applying unroll factor now.  */
2381   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2382   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2383
2384   /* If the slp decision is false when suggested unroll factor is worked
2385      out, and we are applying suggested unroll factor, we can simply skip
2386      all slp related analyses this time.  */
2387   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2388
2389   /* Classify all cross-iteration scalar data-flow cycles.
2390      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2391   vect_analyze_scalar_cycles (loop_vinfo, slp);
2392
2393   vect_pattern_recog (loop_vinfo);
2394
2395   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2396
2397   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2398      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2399
2400   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2401   if (!ok)
2402     {
2403       if (dump_enabled_p ())
2404         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405                          "bad data access.\n");
2406       return ok;
2407     }
2408
2409   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2410
2411   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2412   if (!ok)
2413     {
2414       if (dump_enabled_p ())
2415         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2416                          "unexpected pattern.\n");
2417       return ok;
2418     }
2419
2420   /* While the rest of the analysis below depends on it in some way.  */
2421   fatal = false;
2422
2423   /* Analyze data dependences between the data-refs in the loop
2424      and adjust the maximum vectorization factor according to
2425      the dependences.
2426      FORNOW: fail at the first data dependence that we encounter.  */
2427
2428   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2429   if (!ok)
2430     {
2431       if (dump_enabled_p ())
2432         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2433                          "bad data dependence.\n");
2434       return ok;
2435     }
2436   if (max_vf != MAX_VECTORIZATION_FACTOR
2437       && maybe_lt (max_vf, min_vf))
2438     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2439   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2440
2441   ok = vect_determine_vectorization_factor (loop_vinfo);
2442   if (!ok)
2443     {
2444       if (dump_enabled_p ())
2445         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2446                          "can't determine vectorization factor.\n");
2447       return ok;
2448     }
2449   if (max_vf != MAX_VECTORIZATION_FACTOR
2450       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2451     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2452
2453   /* Compute the scalar iteration cost.  */
2454   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2455
2456   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2457
2458   if (slp)
2459     {
2460       /* Check the SLP opportunities in the loop, analyze and build
2461          SLP trees.  */
2462       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2463       if (!ok)
2464         return ok;
2465
2466       /* If there are any SLP instances mark them as pure_slp.  */
2467       slp = vect_make_slp_decision (loop_vinfo);
2468       if (slp)
2469         {
2470           /* Find stmts that need to be both vectorized and SLPed.  */
2471           vect_detect_hybrid_slp (loop_vinfo);
2472
2473           /* Update the vectorization factor based on the SLP decision.  */
2474           vect_update_vf_for_slp (loop_vinfo);
2475
2476           /* Optimize the SLP graph with the vectorization factor fixed.  */
2477           vect_optimize_slp (loop_vinfo);
2478
2479           /* Gather the loads reachable from the SLP graph entries.  */
2480           vect_gather_slp_loads (loop_vinfo);
2481         }
2482     }
2483
2484   bool saved_can_use_partial_vectors_p
2485     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2486
2487   /* We don't expect to have to roll back to anything other than an empty
2488      set of rgroups.  */
2489   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2490
2491   /* This is the point where we can re-start analysis with SLP forced off.  */
2492 start_over:
2493
2494   /* Apply the suggested unrolling factor, this was determined by the backend
2495      during finish_cost the first time we ran the analyzis for this
2496      vector mode.  */
2497   if (applying_suggested_uf)
2498     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2499
2500   /* Now the vectorization factor is final.  */
2501   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2502   gcc_assert (known_ne (vectorization_factor, 0U));
2503
2504   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2505     {
2506       dump_printf_loc (MSG_NOTE, vect_location,
2507                        "vectorization_factor = ");
2508       dump_dec (MSG_NOTE, vectorization_factor);
2509       dump_printf (MSG_NOTE, ", niters = %wd\n",
2510                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2511     }
2512
2513   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2514
2515   /* Analyze the alignment of the data-refs in the loop.
2516      Fail if a data reference is found that cannot be vectorized.  */
2517
2518   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2519   if (!ok)
2520     {
2521       if (dump_enabled_p ())
2522         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523                          "bad data alignment.\n");
2524       return ok;
2525     }
2526
2527   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2528      It is important to call pruning after vect_analyze_data_ref_accesses,
2529      since we use grouping information gathered by interleaving analysis.  */
2530   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2531   if (!ok)
2532     return ok;
2533
2534   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2535      vectorization, since we do not want to add extra peeling or
2536      add versioning for alignment.  */
2537   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2538     /* This pass will decide on using loop versioning and/or loop peeling in
2539        order to enhance the alignment of data references in the loop.  */
2540     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2541   if (!ok)
2542     return ok;
2543
2544   if (slp)
2545     {
2546       /* Analyze operations in the SLP instances.  Note this may
2547          remove unsupported SLP instances which makes the above
2548          SLP kind detection invalid.  */
2549       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2550       vect_slp_analyze_operations (loop_vinfo);
2551       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2552         {
2553           ok = opt_result::failure_at (vect_location,
2554                                        "unsupported SLP instances\n");
2555           goto again;
2556         }
2557
2558       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2559       slp_tree load_node, slp_root;
2560       unsigned i, x;
2561       slp_instance instance;
2562       bool can_use_lanes = true;
2563       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2564         {
2565           slp_root = SLP_INSTANCE_TREE (instance);
2566           int group_size = SLP_TREE_LANES (slp_root);
2567           tree vectype = SLP_TREE_VECTYPE (slp_root);
2568           bool loads_permuted = false;
2569           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2570             {
2571               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2572                 continue;
2573               unsigned j;
2574               stmt_vec_info load_info;
2575               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2576                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2577                   {
2578                     loads_permuted = true;
2579                     break;
2580                   }
2581             }
2582
2583           /* If the loads and stores can be handled with load/store-lane
2584              instructions record it and move on to the next instance.  */
2585           if (loads_permuted
2586               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2587               && vect_store_lanes_supported (vectype, group_size, false))
2588             {
2589               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2590                 {
2591                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2592                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2593                   /* Use SLP for strided accesses (or if we can't
2594                      load-lanes).  */
2595                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2596                       || ! vect_load_lanes_supported
2597                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2598                              DR_GROUP_SIZE (stmt_vinfo), false))
2599                     break;
2600                 }
2601
2602               can_use_lanes
2603                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2604
2605               if (can_use_lanes && dump_enabled_p ())
2606                 dump_printf_loc (MSG_NOTE, vect_location,
2607                                  "SLP instance %p can use load/store-lanes\n",
2608                                  (void *) instance);
2609             }
2610           else
2611             {
2612               can_use_lanes = false;
2613               break;
2614             }
2615         }
2616
2617       /* If all SLP instances can use load/store-lanes abort SLP and try again
2618          with SLP disabled.  */
2619       if (can_use_lanes)
2620         {
2621           ok = opt_result::failure_at (vect_location,
2622                                        "Built SLP cancelled: can use "
2623                                        "load/store-lanes\n");
2624           if (dump_enabled_p ())
2625             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2626                              "Built SLP cancelled: all SLP instances support "
2627                              "load/store-lanes\n");
2628           goto again;
2629         }
2630     }
2631
2632   /* Dissolve SLP-only groups.  */
2633   vect_dissolve_slp_only_groups (loop_vinfo);
2634
2635   /* Scan all the remaining operations in the loop that are not subject
2636      to SLP and make sure they are vectorizable.  */
2637   ok = vect_analyze_loop_operations (loop_vinfo);
2638   if (!ok)
2639     {
2640       if (dump_enabled_p ())
2641         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2642                          "bad operation or unsupported loop bound.\n");
2643       return ok;
2644     }
2645
2646   /* For now, we don't expect to mix both masking and length approaches for one
2647      loop, disable it if both are recorded.  */
2648   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2649       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2650       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2651     {
2652       if (dump_enabled_p ())
2653         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2654                          "can't vectorize a loop with partial vectors"
2655                          " because we don't expect to mix different"
2656                          " approaches with partial vectors for the"
2657                          " same loop.\n");
2658       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2659     }
2660
2661   /* If we still have the option of using partial vectors,
2662      check whether we can generate the necessary loop controls.  */
2663   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2664       && !vect_verify_full_masking (loop_vinfo)
2665       && !vect_verify_loop_lens (loop_vinfo))
2666     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2667
2668   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2669      to be able to handle fewer than VF scalars, or needs to have a lower VF
2670      than the main loop.  */
2671   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2672       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2673       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2674                    LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2675     return opt_result::failure_at (vect_location,
2676                                    "Vectorization factor too high for"
2677                                    " epilogue loop.\n");
2678
2679   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2680      assuming that the loop will be used as a main loop.  We will redo
2681      this analysis later if we instead decide to use the loop as an
2682      epilogue loop.  */
2683   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2684   if (!ok)
2685     return ok;
2686
2687   /* Check the costings of the loop make vectorizing worthwhile.  */
2688   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2689   if (res < 0)
2690     {
2691       ok = opt_result::failure_at (vect_location,
2692                                    "Loop costings may not be worthwhile.\n");
2693       goto again;
2694     }
2695   if (!res)
2696     return opt_result::failure_at (vect_location,
2697                                    "Loop costings not worthwhile.\n");
2698
2699   /* If an epilogue loop is required make sure we can create one.  */
2700   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2701       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2702     {
2703       if (dump_enabled_p ())
2704         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2705       if (!vect_can_advance_ivs_p (loop_vinfo)
2706           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2707                                            single_exit (LOOP_VINFO_LOOP
2708                                                          (loop_vinfo))))
2709         {
2710           ok = opt_result::failure_at (vect_location,
2711                                        "not vectorized: can't create required "
2712                                        "epilog loop\n");
2713           goto again;
2714         }
2715     }
2716
2717   /* During peeling, we need to check if number of loop iterations is
2718      enough for both peeled prolog loop and vector loop.  This check
2719      can be merged along with threshold check of loop versioning, so
2720      increase threshold for this case if necessary.
2721
2722      If we are analyzing an epilogue we still want to check what its
2723      versioning threshold would be.  If we decide to vectorize the epilogues we
2724      will want to use the lowest versioning threshold of all epilogues and main
2725      loop.  This will enable us to enter a vectorized epilogue even when
2726      versioning the loop.  We can't simply check whether the epilogue requires
2727      versioning though since we may have skipped some versioning checks when
2728      analyzing the epilogue.  For instance, checks for alias versioning will be
2729      skipped when dealing with epilogues as we assume we already checked them
2730      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2731   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2732     {
2733       poly_uint64 niters_th = 0;
2734       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2735
2736       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2737         {
2738           /* Niters for peeled prolog loop.  */
2739           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2740             {
2741               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2742               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2743               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2744             }
2745           else
2746             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2747         }
2748
2749       /* Niters for at least one iteration of vectorized loop.  */
2750       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2751         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2752       /* One additional iteration because of peeling for gap.  */
2753       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2754         niters_th += 1;
2755
2756       /*  Use the same condition as vect_transform_loop to decide when to use
2757           the cost to determine a versioning threshold.  */
2758       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2759           && ordered_p (th, niters_th))
2760         niters_th = ordered_max (poly_uint64 (th), niters_th);
2761
2762       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2763     }
2764
2765   gcc_assert (known_eq (vectorization_factor,
2766                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2767
2768   slp_done_for_suggested_uf = slp;
2769
2770   /* Ok to vectorize!  */
2771   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2772   return opt_result::success ();
2773
2774 again:
2775   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2776   gcc_assert (!ok);
2777
2778   /* Try again with SLP forced off but if we didn't do any SLP there is
2779      no point in re-trying.  */
2780   if (!slp)
2781     return ok;
2782
2783   /* If the slp decision is true when suggested unroll factor is worked
2784      out, and we are applying suggested unroll factor, we don't need to
2785      re-try any more.  */
2786   if (applying_suggested_uf && slp_done_for_suggested_uf)
2787     return ok;
2788
2789   /* If there are reduction chains re-trying will fail anyway.  */
2790   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2791     return ok;
2792
2793   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2794      via interleaving or lane instructions.  */
2795   slp_instance instance;
2796   slp_tree node;
2797   unsigned i, j;
2798   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2799     {
2800       stmt_vec_info vinfo;
2801       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2802       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2803         continue;
2804       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2805       unsigned int size = DR_GROUP_SIZE (vinfo);
2806       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2807       if (! vect_store_lanes_supported (vectype, size, false)
2808          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2809          && ! vect_grouped_store_supported (vectype, size))
2810         return opt_result::failure_at (vinfo->stmt,
2811                                        "unsupported grouped store\n");
2812       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2813         {
2814           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2815           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2816           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2817           size = DR_GROUP_SIZE (vinfo);
2818           vectype = STMT_VINFO_VECTYPE (vinfo);
2819           if (! vect_load_lanes_supported (vectype, size, false)
2820               && ! vect_grouped_load_supported (vectype, single_element_p,
2821                                                 size))
2822             return opt_result::failure_at (vinfo->stmt,
2823                                            "unsupported grouped load\n");
2824         }
2825     }
2826
2827   if (dump_enabled_p ())
2828     dump_printf_loc (MSG_NOTE, vect_location,
2829                      "re-trying with SLP disabled\n");
2830
2831   /* Roll back state appropriately.  No SLP this time.  */
2832   slp = false;
2833   /* Restore vectorization factor as it were without SLP.  */
2834   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2835   /* Free the SLP instances.  */
2836   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2837     vect_free_slp_instance (instance);
2838   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2839   /* Reset SLP type to loop_vect on all stmts.  */
2840   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2841     {
2842       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2843       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2844            !gsi_end_p (si); gsi_next (&si))
2845         {
2846           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2847           STMT_SLP_TYPE (stmt_info) = loop_vect;
2848           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2849               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2850             {
2851               /* vectorizable_reduction adjusts reduction stmt def-types,
2852                  restore them to that of the PHI.  */
2853               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2854                 = STMT_VINFO_DEF_TYPE (stmt_info);
2855               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2856                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
2857                 = STMT_VINFO_DEF_TYPE (stmt_info);
2858             }
2859         }
2860       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2861            !gsi_end_p (si); gsi_next (&si))
2862         {
2863           if (is_gimple_debug (gsi_stmt (si)))
2864             continue;
2865           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2866           STMT_SLP_TYPE (stmt_info) = loop_vect;
2867           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2868             {
2869               stmt_vec_info pattern_stmt_info
2870                 = STMT_VINFO_RELATED_STMT (stmt_info);
2871               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2872                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2873
2874               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2875               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2876               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2877                    !gsi_end_p (pi); gsi_next (&pi))
2878                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2879                   = loop_vect;
2880             }
2881         }
2882     }
2883   /* Free optimized alias test DDRS.  */
2884   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2885   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2886   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2887   /* Reset target cost data.  */
2888   delete loop_vinfo->vector_costs;
2889   loop_vinfo->vector_costs = nullptr;
2890   /* Reset accumulated rgroup information.  */
2891   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2892   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2893   /* Reset assorted flags.  */
2894   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2895   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2896   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2897   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2898   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2899     = saved_can_use_partial_vectors_p;
2900
2901   goto start_over;
2902 }
2903
2904 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2905    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2906    OLD_LOOP_VINFO is better unless something specifically indicates
2907    otherwise.
2908
2909    Note that this deliberately isn't a partial order.  */
2910
2911 static bool
2912 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2913                           loop_vec_info old_loop_vinfo)
2914 {
2915   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2916   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2917
2918   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2919   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2920
2921   /* Always prefer a VF of loop->simdlen over any other VF.  */
2922   if (loop->simdlen)
2923     {
2924       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2925       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2926       if (new_simdlen_p != old_simdlen_p)
2927         return new_simdlen_p;
2928     }
2929
2930   const auto *old_costs = old_loop_vinfo->vector_costs;
2931   const auto *new_costs = new_loop_vinfo->vector_costs;
2932   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2933     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2934
2935   return new_costs->better_main_loop_than_p (old_costs);
2936 }
2937
2938 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2939    true if we should.  */
2940
2941 static bool
2942 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2943                         loop_vec_info old_loop_vinfo)
2944 {
2945   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2946     return false;
2947
2948   if (dump_enabled_p ())
2949     dump_printf_loc (MSG_NOTE, vect_location,
2950                      "***** Preferring vector mode %s to vector mode %s\n",
2951                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
2952                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
2953   return true;
2954 }
2955
2956 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2957    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2958    MODE_I to the next mode useful to analyze.
2959    Return the loop_vinfo on success and wrapped null on failure.  */
2960
2961 static opt_loop_vec_info
2962 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2963                      const vect_loop_form_info *loop_form_info,
2964                      loop_vec_info main_loop_vinfo,
2965                      const vector_modes &vector_modes, unsigned &mode_i,
2966                      machine_mode &autodetected_vector_mode,
2967                      bool &fatal)
2968 {
2969   loop_vec_info loop_vinfo
2970     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2971
2972   machine_mode vector_mode = vector_modes[mode_i];
2973   loop_vinfo->vector_mode = vector_mode;
2974   unsigned int suggested_unroll_factor = 1;
2975   bool slp_done_for_suggested_uf;
2976
2977   /* Run the main analysis.  */
2978   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2979                                         &suggested_unroll_factor,
2980                                         slp_done_for_suggested_uf);
2981   if (dump_enabled_p ())
2982     dump_printf_loc (MSG_NOTE, vect_location,
2983                      "***** Analysis %s with vector mode %s\n",
2984                      res ? "succeeded" : " failed",
2985                      GET_MODE_NAME (loop_vinfo->vector_mode));
2986
2987   if (!main_loop_vinfo && suggested_unroll_factor > 1)
2988     {
2989       if (dump_enabled_p ())
2990         dump_printf_loc (MSG_NOTE, vect_location,
2991                          "***** Re-trying analysis for unrolling"
2992                          " with unroll factor %d and slp %s.\n",
2993                          suggested_unroll_factor,
2994                          slp_done_for_suggested_uf ? "on" : "off");
2995       loop_vec_info unroll_vinfo
2996         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2997       unroll_vinfo->vector_mode = vector_mode;
2998       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2999       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3000                                                 slp_done_for_suggested_uf);
3001       if (new_res)
3002         {
3003           delete loop_vinfo;
3004           loop_vinfo = unroll_vinfo;
3005         }
3006       else
3007         delete unroll_vinfo;
3008     }
3009
3010   /* Remember the autodetected vector mode.  */
3011   if (vector_mode == VOIDmode)
3012     autodetected_vector_mode = loop_vinfo->vector_mode;
3013
3014   /* Advance mode_i, first skipping modes that would result in the
3015      same analysis result.  */
3016   while (mode_i + 1 < vector_modes.length ()
3017          && vect_chooses_same_modes_p (loop_vinfo,
3018                                        vector_modes[mode_i + 1]))
3019     {
3020       if (dump_enabled_p ())
3021         dump_printf_loc (MSG_NOTE, vect_location,
3022                          "***** The result for vector mode %s would"
3023                          " be the same\n",
3024                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3025       mode_i += 1;
3026     }
3027   if (mode_i + 1 < vector_modes.length ()
3028       && VECTOR_MODE_P (autodetected_vector_mode)
3029       && (related_vector_mode (vector_modes[mode_i + 1],
3030                                GET_MODE_INNER (autodetected_vector_mode))
3031           == autodetected_vector_mode)
3032       && (related_vector_mode (autodetected_vector_mode,
3033                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3034           == vector_modes[mode_i + 1]))
3035     {
3036       if (dump_enabled_p ())
3037         dump_printf_loc (MSG_NOTE, vect_location,
3038                          "***** Skipping vector mode %s, which would"
3039                          " repeat the analysis for %s\n",
3040                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3041                          GET_MODE_NAME (autodetected_vector_mode));
3042       mode_i += 1;
3043     }
3044   mode_i++;
3045
3046   if (!res)
3047     {
3048       delete loop_vinfo;
3049       if (fatal)
3050         gcc_checking_assert (main_loop_vinfo == NULL);
3051       return opt_loop_vec_info::propagate_failure (res);
3052     }
3053
3054   return opt_loop_vec_info::success (loop_vinfo);
3055 }
3056
3057 /* Function vect_analyze_loop.
3058
3059    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3060    for it.  The different analyses will record information in the
3061    loop_vec_info struct.  */
3062 opt_loop_vec_info
3063 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3064 {
3065   DUMP_VECT_SCOPE ("analyze_loop_nest");
3066
3067   if (loop_outer (loop)
3068       && loop_vec_info_for_loop (loop_outer (loop))
3069       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3070     return opt_loop_vec_info::failure_at (vect_location,
3071                                           "outer-loop already vectorized.\n");
3072
3073   if (!find_loop_nest (loop, &shared->loop_nest))
3074     return opt_loop_vec_info::failure_at
3075       (vect_location,
3076        "not vectorized: loop nest containing two or more consecutive inner"
3077        " loops cannot be vectorized\n");
3078
3079   /* Analyze the loop form.  */
3080   vect_loop_form_info loop_form_info;
3081   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3082   if (!res)
3083     {
3084       if (dump_enabled_p ())
3085         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3086                          "bad loop form.\n");
3087       return opt_loop_vec_info::propagate_failure (res);
3088     }
3089   if (!integer_onep (loop_form_info.assumptions))
3090     {
3091       /* We consider to vectorize this loop by versioning it under
3092          some assumptions.  In order to do this, we need to clear
3093          existing information computed by scev and niter analyzer.  */
3094       scev_reset_htab ();
3095       free_numbers_of_iterations_estimates (loop);
3096       /* Also set flag for this loop so that following scev and niter
3097          analysis are done under the assumptions.  */
3098       loop_constraint_set (loop, LOOP_C_FINITE);
3099     }
3100
3101   auto_vector_modes vector_modes;
3102   /* Autodetect first vector size we try.  */
3103   vector_modes.safe_push (VOIDmode);
3104   unsigned int autovec_flags
3105     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3106                                                     loop->simdlen != 0);
3107   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3108                              && !unlimited_cost_model (loop));
3109   machine_mode autodetected_vector_mode = VOIDmode;
3110   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3111   unsigned int mode_i = 0;
3112   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3113
3114   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3115      a mode has not been analyzed.  */
3116   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3117   for (unsigned i = 0; i < vector_modes.length (); ++i)
3118     cached_vf_per_mode.safe_push (0);
3119
3120   /* First determine the main loop vectorization mode, either the first
3121      one that works, starting with auto-detecting the vector mode and then
3122      following the targets order of preference, or the one with the
3123      lowest cost if pick_lowest_cost_p.  */
3124   while (1)
3125     {
3126       bool fatal;
3127       unsigned int last_mode_i = mode_i;
3128       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3129          failed.  */
3130       cached_vf_per_mode[last_mode_i] = -1;
3131       opt_loop_vec_info loop_vinfo
3132         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3133                                NULL, vector_modes, mode_i,
3134                                autodetected_vector_mode, fatal);
3135       if (fatal)
3136         break;
3137
3138       if (loop_vinfo)
3139         {
3140           /*  Analyzis has been successful so update the VF value.  The
3141               VF should always be a multiple of unroll_factor and we want to
3142               capture the original VF here.  */
3143           cached_vf_per_mode[last_mode_i]
3144             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3145                          loop_vinfo->suggested_unroll_factor);
3146           /* Once we hit the desired simdlen for the first time,
3147              discard any previous attempts.  */
3148           if (simdlen
3149               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3150             {
3151               delete first_loop_vinfo;
3152               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3153               simdlen = 0;
3154             }
3155           else if (pick_lowest_cost_p
3156                    && first_loop_vinfo
3157                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3158             {
3159               /* Pick loop_vinfo over first_loop_vinfo.  */
3160               delete first_loop_vinfo;
3161               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3162             }
3163           if (first_loop_vinfo == NULL)
3164             first_loop_vinfo = loop_vinfo;
3165           else
3166             {
3167               delete loop_vinfo;
3168               loop_vinfo = opt_loop_vec_info::success (NULL);
3169             }
3170
3171           /* Commit to first_loop_vinfo if we have no reason to try
3172              alternatives.  */
3173           if (!simdlen && !pick_lowest_cost_p)
3174             break;
3175         }
3176       if (mode_i == vector_modes.length ()
3177           || autodetected_vector_mode == VOIDmode)
3178         break;
3179
3180       /* Try the next biggest vector size.  */
3181       if (dump_enabled_p ())
3182         dump_printf_loc (MSG_NOTE, vect_location,
3183                          "***** Re-trying analysis with vector mode %s\n",
3184                          GET_MODE_NAME (vector_modes[mode_i]));
3185     }
3186   if (!first_loop_vinfo)
3187     return opt_loop_vec_info::propagate_failure (res);
3188
3189   if (dump_enabled_p ())
3190     dump_printf_loc (MSG_NOTE, vect_location,
3191                      "***** Choosing vector mode %s\n",
3192                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3193
3194   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3195      enabled, SIMDUID is not set, it is the innermost loop and we have
3196      either already found the loop's SIMDLEN or there was no SIMDLEN to
3197      begin with.
3198      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3199   bool vect_epilogues = (!simdlen
3200                          && loop->inner == NULL
3201                          && param_vect_epilogues_nomask
3202                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3203                          && !loop->simduid);
3204   if (!vect_epilogues)
3205     return first_loop_vinfo;
3206
3207   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3208   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3209
3210   /* For epilogues start the analysis from the first mode.  The motivation
3211      behind starting from the beginning comes from cases where the VECTOR_MODES
3212      array may contain length-agnostic and length-specific modes.  Their
3213      ordering is not guaranteed, so we could end up picking a mode for the main
3214      loop that is after the epilogue's optimal mode.  */
3215   vector_modes[0] = autodetected_vector_mode;
3216   mode_i = 0;
3217
3218   bool supports_partial_vectors =
3219     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3220   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3221
3222   while (1)
3223     {
3224       /* If the target does not support partial vectors we can shorten the
3225          number of modes to analyze for the epilogue as we know we can't pick a
3226          mode that would lead to a VF at least as big as the
3227          FIRST_VINFO_VF.  */
3228       if (!supports_partial_vectors
3229           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3230         {
3231           mode_i++;
3232           if (mode_i == vector_modes.length ())
3233             break;
3234           continue;
3235         }
3236
3237       if (dump_enabled_p ())
3238         dump_printf_loc (MSG_NOTE, vect_location,
3239                          "***** Re-trying epilogue analysis with vector "
3240                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3241
3242       bool fatal;
3243       opt_loop_vec_info loop_vinfo
3244         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3245                                first_loop_vinfo,
3246                                vector_modes, mode_i,
3247                                autodetected_vector_mode, fatal);
3248       if (fatal)
3249         break;
3250
3251       if (loop_vinfo)
3252         {
3253           if (pick_lowest_cost_p)
3254             {
3255               /* Keep trying to roll back vectorization attempts while the
3256                  loop_vec_infos they produced were worse than this one.  */
3257               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3258               while (!vinfos.is_empty ()
3259                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3260                 {
3261                   gcc_assert (vect_epilogues);
3262                   delete vinfos.pop ();
3263                 }
3264             }
3265           /* For now only allow one epilogue loop.  */
3266           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3267             {
3268               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3269               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3270               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3271                           || maybe_ne (lowest_th, 0U));
3272               /* Keep track of the known smallest versioning
3273                  threshold.  */
3274               if (ordered_p (lowest_th, th))
3275                 lowest_th = ordered_min (lowest_th, th);
3276             }
3277           else
3278             {
3279               delete loop_vinfo;
3280               loop_vinfo = opt_loop_vec_info::success (NULL);
3281             }
3282
3283           /* For now only allow one epilogue loop, but allow
3284              pick_lowest_cost_p to replace it, so commit to the
3285              first epilogue if we have no reason to try alternatives.  */
3286           if (!pick_lowest_cost_p)
3287             break;
3288         }
3289
3290       if (mode_i == vector_modes.length ())
3291         break;
3292
3293     }
3294
3295   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3296     {
3297       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3298       if (dump_enabled_p ())
3299         dump_printf_loc (MSG_NOTE, vect_location,
3300                          "***** Choosing epilogue vector mode %s\n",
3301                          GET_MODE_NAME
3302                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3303     }
3304
3305   return first_loop_vinfo;
3306 }
3307
3308 /* Return true if there is an in-order reduction function for CODE, storing
3309    it in *REDUC_FN if so.  */
3310
3311 static bool
3312 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3313 {
3314   if (code == PLUS_EXPR)
3315     {
3316       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3317       return true;
3318     }
3319   return false;
3320 }
3321
3322 /* Function reduction_fn_for_scalar_code
3323
3324    Input:
3325    CODE - tree_code of a reduction operations.
3326
3327    Output:
3328    REDUC_FN - the corresponding internal function to be used to reduce the
3329       vector of partial results into a single scalar result, or IFN_LAST
3330       if the operation is a supported reduction operation, but does not have
3331       such an internal function.
3332
3333    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3334
3335 bool
3336 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3337 {
3338   if (code.is_tree_code ())
3339     switch (tree_code (code))
3340       {
3341       case MAX_EXPR:
3342         *reduc_fn = IFN_REDUC_MAX;
3343         return true;
3344
3345       case MIN_EXPR:
3346         *reduc_fn = IFN_REDUC_MIN;
3347         return true;
3348
3349       case PLUS_EXPR:
3350         *reduc_fn = IFN_REDUC_PLUS;
3351         return true;
3352
3353       case BIT_AND_EXPR:
3354         *reduc_fn = IFN_REDUC_AND;
3355         return true;
3356
3357       case BIT_IOR_EXPR:
3358         *reduc_fn = IFN_REDUC_IOR;
3359         return true;
3360
3361       case BIT_XOR_EXPR:
3362         *reduc_fn = IFN_REDUC_XOR;
3363         return true;
3364
3365       case MULT_EXPR:
3366       case MINUS_EXPR:
3367         *reduc_fn = IFN_LAST;
3368         return true;
3369
3370       default:
3371         return false;
3372       }
3373   else
3374     switch (combined_fn (code))
3375       {
3376       CASE_CFN_FMAX:
3377         *reduc_fn = IFN_REDUC_FMAX;
3378         return true;
3379
3380       CASE_CFN_FMIN:
3381         *reduc_fn = IFN_REDUC_FMIN;
3382         return true;
3383
3384       default:
3385         return false;
3386       }
3387 }
3388
3389 /* If there is a neutral value X such that a reduction would not be affected
3390    by the introduction of additional X elements, return that X, otherwise
3391    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3392    of the scalar elements.  If the reduction has just a single initial value
3393    then INITIAL_VALUE is that value, otherwise it is null.  */
3394
3395 tree
3396 neutral_op_for_reduction (tree scalar_type, code_helper code,
3397                           tree initial_value)
3398 {
3399   if (code.is_tree_code ())
3400     switch (tree_code (code))
3401       {
3402       case WIDEN_SUM_EXPR:
3403       case DOT_PROD_EXPR:
3404       case SAD_EXPR:
3405       case PLUS_EXPR:
3406       case MINUS_EXPR:
3407       case BIT_IOR_EXPR:
3408       case BIT_XOR_EXPR:
3409         return build_zero_cst (scalar_type);
3410
3411       case MULT_EXPR:
3412         return build_one_cst (scalar_type);
3413
3414       case BIT_AND_EXPR:
3415         return build_all_ones_cst (scalar_type);
3416
3417       case MAX_EXPR:
3418       case MIN_EXPR:
3419         return initial_value;
3420
3421       default:
3422         return NULL_TREE;
3423       }
3424   else
3425     switch (combined_fn (code))
3426       {
3427       CASE_CFN_FMIN:
3428       CASE_CFN_FMAX:
3429         return initial_value;
3430
3431       default:
3432         return NULL_TREE;
3433       }
3434 }
3435
3436 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3437    STMT is printed with a message MSG. */
3438
3439 static void
3440 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3441 {
3442   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3443 }
3444
3445 /* Return true if we need an in-order reduction for operation CODE
3446    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3447    overflow must wrap.  */
3448
3449 bool
3450 needs_fold_left_reduction_p (tree type, code_helper code)
3451 {
3452   /* CHECKME: check for !flag_finite_math_only too?  */
3453   if (SCALAR_FLOAT_TYPE_P (type))
3454     {
3455       if (code.is_tree_code ())
3456         switch (tree_code (code))
3457           {
3458           case MIN_EXPR:
3459           case MAX_EXPR:
3460             return false;
3461
3462           default:
3463             return !flag_associative_math;
3464           }
3465       else
3466         switch (combined_fn (code))
3467           {
3468           CASE_CFN_FMIN:
3469           CASE_CFN_FMAX:
3470             return false;
3471
3472           default:
3473             return !flag_associative_math;
3474           }
3475     }
3476
3477   if (INTEGRAL_TYPE_P (type))
3478     return (!code.is_tree_code ()
3479             || !operation_no_trapping_overflow (type, tree_code (code)));
3480
3481   if (SAT_FIXED_POINT_TYPE_P (type))
3482     return true;
3483
3484   return false;
3485 }
3486
3487 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3488    has a handled computation expression.  Store the main reduction
3489    operation in *CODE.  */
3490
3491 static bool
3492 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3493                       tree loop_arg, code_helper *code,
3494                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3495 {
3496   auto_bitmap visited;
3497   tree lookfor = PHI_RESULT (phi);
3498   ssa_op_iter curri;
3499   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3500   while (USE_FROM_PTR (curr) != loop_arg)
3501     curr = op_iter_next_use (&curri);
3502   curri.i = curri.numops;
3503   do
3504     {
3505       path.safe_push (std::make_pair (curri, curr));
3506       tree use = USE_FROM_PTR (curr);
3507       if (use == lookfor)
3508         break;
3509       gimple *def = SSA_NAME_DEF_STMT (use);
3510       if (gimple_nop_p (def)
3511           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3512         {
3513 pop:
3514           do
3515             {
3516               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3517               curri = x.first;
3518               curr = x.second;
3519               do
3520                 curr = op_iter_next_use (&curri);
3521               /* Skip already visited or non-SSA operands (from iterating
3522                  over PHI args).  */
3523               while (curr != NULL_USE_OPERAND_P
3524                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3525                          || ! bitmap_set_bit (visited,
3526                                               SSA_NAME_VERSION
3527                                                 (USE_FROM_PTR (curr)))));
3528             }
3529           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3530           if (curr == NULL_USE_OPERAND_P)
3531             break;
3532         }
3533       else
3534         {
3535           if (gimple_code (def) == GIMPLE_PHI)
3536             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3537           else
3538             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3539           while (curr != NULL_USE_OPERAND_P
3540                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3541                      || ! bitmap_set_bit (visited,
3542                                           SSA_NAME_VERSION
3543                                             (USE_FROM_PTR (curr)))))
3544             curr = op_iter_next_use (&curri);
3545           if (curr == NULL_USE_OPERAND_P)
3546             goto pop;
3547         }
3548     }
3549   while (1);
3550   if (dump_file && (dump_flags & TDF_DETAILS))
3551     {
3552       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3553       unsigned i;
3554       std::pair<ssa_op_iter, use_operand_p> *x;
3555       FOR_EACH_VEC_ELT (path, i, x)
3556         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3557       dump_printf (MSG_NOTE, "\n");
3558     }
3559
3560   /* Check whether the reduction path detected is valid.  */
3561   bool fail = path.length () == 0;
3562   bool neg = false;
3563   int sign = -1;
3564   *code = ERROR_MARK;
3565   for (unsigned i = 1; i < path.length (); ++i)
3566     {
3567       gimple *use_stmt = USE_STMT (path[i].second);
3568       gimple_match_op op;
3569       if (!gimple_extract_op (use_stmt, &op))
3570         {
3571           fail = true;
3572           break;
3573         }
3574       unsigned int opi = op.num_ops;
3575       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3576         {
3577           /* The following make sure we can compute the operand index
3578              easily plus it mostly disallows chaining via COND_EXPR condition
3579              operands.  */
3580           for (opi = 0; opi < op.num_ops; ++opi)
3581             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3582               break;
3583         }
3584       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3585         {
3586           for (opi = 0; opi < op.num_ops; ++opi)
3587             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3588               break;
3589         }
3590       if (opi == op.num_ops)
3591         {
3592           fail = true;
3593           break;
3594         }
3595       op.code = canonicalize_code (op.code, op.type);
3596       if (op.code == MINUS_EXPR)
3597         {
3598           op.code = PLUS_EXPR;
3599           /* Track whether we negate the reduction value each iteration.  */
3600           if (op.ops[1] == op.ops[opi])
3601             neg = ! neg;
3602         }
3603       if (CONVERT_EXPR_CODE_P (op.code)
3604           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3605         ;
3606       else if (*code == ERROR_MARK)
3607         {
3608           *code = op.code;
3609           sign = TYPE_SIGN (op.type);
3610         }
3611       else if (op.code != *code)
3612         {
3613           fail = true;
3614           break;
3615         }
3616       else if ((op.code == MIN_EXPR
3617                 || op.code == MAX_EXPR)
3618                && sign != TYPE_SIGN (op.type))
3619         {
3620           fail = true;
3621           break;
3622         }
3623       /* Check there's only a single stmt the op is used on.  For the
3624          not value-changing tail and the last stmt allow out-of-loop uses.
3625          ???  We could relax this and handle arbitrary live stmts by
3626          forcing a scalar epilogue for example.  */
3627       imm_use_iterator imm_iter;
3628       gimple *op_use_stmt;
3629       unsigned cnt = 0;
3630       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3631         if (!is_gimple_debug (op_use_stmt)
3632             && (*code != ERROR_MARK
3633                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3634           {
3635             /* We want to allow x + x but not x < 1 ? x : 2.  */
3636             if (is_gimple_assign (op_use_stmt)
3637                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3638               {
3639                 use_operand_p use_p;
3640                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3641                   cnt++;
3642               }
3643             else
3644               cnt++;
3645           }
3646       if (cnt != 1)
3647         {
3648           fail = true;
3649           break;
3650         }
3651     }
3652   return ! fail && ! neg && *code != ERROR_MARK;
3653 }
3654
3655 bool
3656 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3657                       tree loop_arg, enum tree_code code)
3658 {
3659   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3660   code_helper code_;
3661   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3662           && code_ == code);
3663 }
3664
3665
3666
3667 /* Function vect_is_simple_reduction
3668
3669    (1) Detect a cross-iteration def-use cycle that represents a simple
3670    reduction computation.  We look for the following pattern:
3671
3672    loop_header:
3673      a1 = phi < a0, a2 >
3674      a3 = ...
3675      a2 = operation (a3, a1)
3676
3677    or
3678
3679    a3 = ...
3680    loop_header:
3681      a1 = phi < a0, a2 >
3682      a2 = operation (a3, a1)
3683
3684    such that:
3685    1. operation is commutative and associative and it is safe to
3686       change the order of the computation
3687    2. no uses for a2 in the loop (a2 is used out of the loop)
3688    3. no uses of a1 in the loop besides the reduction operation
3689    4. no uses of a1 outside the loop.
3690
3691    Conditions 1,4 are tested here.
3692    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3693
3694    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3695    nested cycles.
3696
3697    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3698    reductions:
3699
3700      a1 = phi < a0, a2 >
3701      inner loop (def of a3)
3702      a2 = phi < a3 >
3703
3704    (4) Detect condition expressions, ie:
3705      for (int i = 0; i < N; i++)
3706        if (a[i] < val)
3707         ret_val = a[i];
3708
3709 */
3710
3711 static stmt_vec_info
3712 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3713                           bool *double_reduc, bool *reduc_chain_p, bool slp)
3714 {
3715   gphi *phi = as_a <gphi *> (phi_info->stmt);
3716   gimple *phi_use_stmt = NULL;
3717   imm_use_iterator imm_iter;
3718   use_operand_p use_p;
3719
3720   *double_reduc = false;
3721   *reduc_chain_p = false;
3722   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3723
3724   tree phi_name = PHI_RESULT (phi);
3725   /* ???  If there are no uses of the PHI result the inner loop reduction
3726      won't be detected as possibly double-reduction by vectorizable_reduction
3727      because that tries to walk the PHI arg from the preheader edge which
3728      can be constant.  See PR60382.  */
3729   if (has_zero_uses (phi_name))
3730     return NULL;
3731   class loop *loop = (gimple_bb (phi))->loop_father;
3732   unsigned nphi_def_loop_uses = 0;
3733   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3734     {
3735       gimple *use_stmt = USE_STMT (use_p);
3736       if (is_gimple_debug (use_stmt))
3737         continue;
3738
3739       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3740         {
3741           if (dump_enabled_p ())
3742             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3743                              "intermediate value used outside loop.\n");
3744
3745           return NULL;
3746         }
3747
3748       nphi_def_loop_uses++;
3749       phi_use_stmt = use_stmt;
3750     }
3751
3752   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3753   if (TREE_CODE (latch_def) != SSA_NAME)
3754     {
3755       if (dump_enabled_p ())
3756         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3757                          "reduction: not ssa_name: %T\n", latch_def);
3758       return NULL;
3759     }
3760
3761   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3762   if (!def_stmt_info
3763       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3764     return NULL;
3765
3766   bool nested_in_vect_loop
3767     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3768   unsigned nlatch_def_loop_uses = 0;
3769   auto_vec<gphi *, 3> lcphis;
3770   bool inner_loop_of_double_reduc = false;
3771   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3772     {
3773       gimple *use_stmt = USE_STMT (use_p);
3774       if (is_gimple_debug (use_stmt))
3775         continue;
3776       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3777         nlatch_def_loop_uses++;
3778       else
3779         {
3780           /* We can have more than one loop-closed PHI.  */
3781           lcphis.safe_push (as_a <gphi *> (use_stmt));
3782           if (nested_in_vect_loop
3783               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3784                   == vect_double_reduction_def))
3785             inner_loop_of_double_reduc = true;
3786         }
3787     }
3788
3789   /* If we are vectorizing an inner reduction we are executing that
3790      in the original order only in case we are not dealing with a
3791      double reduction.  */
3792   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3793     {
3794       if (dump_enabled_p ())
3795         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3796                         "detected nested cycle: ");
3797       return def_stmt_info;
3798     }
3799
3800   /* When the inner loop of a double reduction ends up with more than
3801      one loop-closed PHI we have failed to classify alternate such
3802      PHIs as double reduction, leading to wrong code.  See PR103237.  */
3803   if (inner_loop_of_double_reduc && lcphis.length () != 1)
3804     {
3805       if (dump_enabled_p ())
3806         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3807                          "unhandle double reduction\n");
3808       return NULL;
3809     }
3810
3811   /* If this isn't a nested cycle or if the nested cycle reduction value
3812      is used ouside of the inner loop we cannot handle uses of the reduction
3813      value.  */
3814   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3815     {
3816       if (dump_enabled_p ())
3817         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3818                          "reduction used in loop.\n");
3819       return NULL;
3820     }
3821
3822   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3823      defined in the inner loop.  */
3824   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3825     {
3826       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3827       if (gimple_phi_num_args (def_stmt) != 1
3828           || TREE_CODE (op1) != SSA_NAME)
3829         {
3830           if (dump_enabled_p ())
3831             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3832                              "unsupported phi node definition.\n");
3833
3834           return NULL;
3835         }
3836
3837       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3838       if (gimple_bb (def1)
3839           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3840           && loop->inner
3841           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3842           && (is_gimple_assign (def1) || is_gimple_call (def1))
3843           && is_a <gphi *> (phi_use_stmt)
3844           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3845         {
3846           if (dump_enabled_p ())
3847             report_vect_op (MSG_NOTE, def_stmt,
3848                             "detected double reduction: ");
3849
3850           *double_reduc = true;
3851           return def_stmt_info;
3852         }
3853
3854       return NULL;
3855     }
3856
3857   /* Look for the expression computing latch_def from then loop PHI result.  */
3858   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3859   code_helper code;
3860   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3861                             path))
3862     {
3863       STMT_VINFO_REDUC_CODE (phi_info) = code;
3864       if (code == COND_EXPR && !nested_in_vect_loop)
3865         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3866
3867       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3868          reduction chain for which the additional restriction is that
3869          all operations in the chain are the same.  */
3870       auto_vec<stmt_vec_info, 8> reduc_chain;
3871       unsigned i;
3872       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3873       for (i = path.length () - 1; i >= 1; --i)
3874         {
3875           gimple *stmt = USE_STMT (path[i].second);
3876           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3877           gimple_match_op op;
3878           if (!gimple_extract_op (stmt, &op))
3879             gcc_unreachable ();
3880           if (gassign *assign = dyn_cast<gassign *> (stmt))
3881             STMT_VINFO_REDUC_IDX (stmt_info)
3882               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3883           else
3884             {
3885               gcall *call = as_a<gcall *> (stmt);
3886               STMT_VINFO_REDUC_IDX (stmt_info)
3887                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3888             }
3889           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3890                                      && (i == 1 || i == path.length () - 1));
3891           if ((op.code != code && !leading_conversion)
3892               /* We can only handle the final value in epilogue
3893                  generation for reduction chains.  */
3894               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3895             is_slp_reduc = false;
3896           /* For reduction chains we support a trailing/leading
3897              conversions.  We do not store those in the actual chain.  */
3898           if (leading_conversion)
3899             continue;
3900           reduc_chain.safe_push (stmt_info);
3901         }
3902       if (slp && is_slp_reduc && reduc_chain.length () > 1)
3903         {
3904           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3905             {
3906               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3907               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3908             }
3909           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3910           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3911
3912           /* Save the chain for further analysis in SLP detection.  */
3913           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3914           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3915
3916           *reduc_chain_p = true;
3917           if (dump_enabled_p ())
3918             dump_printf_loc (MSG_NOTE, vect_location,
3919                             "reduction: detected reduction chain\n");
3920         }
3921       else if (dump_enabled_p ())
3922         dump_printf_loc (MSG_NOTE, vect_location,
3923                          "reduction: detected reduction\n");
3924
3925       return def_stmt_info;
3926     }
3927
3928   if (dump_enabled_p ())
3929     dump_printf_loc (MSG_NOTE, vect_location,
3930                      "reduction: unknown pattern\n");
3931
3932   return NULL;
3933 }
3934
3935 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3936    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3937    or -1 if not known.  */
3938
3939 static int
3940 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3941 {
3942   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3943   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3944     {
3945       if (dump_enabled_p ())
3946         dump_printf_loc (MSG_NOTE, vect_location,
3947                          "cost model: epilogue peel iters set to vf/2 "
3948                          "because loop iterations are unknown .\n");
3949       return assumed_vf / 2;
3950     }
3951   else
3952     {
3953       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3954       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3955       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3956       /* If we need to peel for gaps, but no peeling is required, we have to
3957          peel VF iterations.  */
3958       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3959         peel_iters_epilogue = assumed_vf;
3960       return peel_iters_epilogue;
3961     }
3962 }
3963
3964 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3965 int
3966 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3967                              int *peel_iters_epilogue,
3968                              stmt_vector_for_cost *scalar_cost_vec,
3969                              stmt_vector_for_cost *prologue_cost_vec,
3970                              stmt_vector_for_cost *epilogue_cost_vec)
3971 {
3972   int retval = 0;
3973
3974   *peel_iters_epilogue
3975     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3976
3977   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3978     {
3979       /* If peeled iterations are known but number of scalar loop
3980          iterations are unknown, count a taken branch per peeled loop.  */
3981       if (peel_iters_prologue > 0)
3982         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3983                                    vect_prologue);
3984       if (*peel_iters_epilogue > 0)
3985         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3986                                     vect_epilogue);
3987     }
3988
3989   stmt_info_for_cost *si;
3990   int j;
3991   if (peel_iters_prologue)
3992     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3993       retval += record_stmt_cost (prologue_cost_vec,
3994                                   si->count * peel_iters_prologue,
3995                                   si->kind, si->stmt_info, si->misalign,
3996                                   vect_prologue);
3997   if (*peel_iters_epilogue)
3998     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3999       retval += record_stmt_cost (epilogue_cost_vec,
4000                                   si->count * *peel_iters_epilogue,
4001                                   si->kind, si->stmt_info, si->misalign,
4002                                   vect_epilogue);
4003
4004   return retval;
4005 }
4006
4007 /* Function vect_estimate_min_profitable_iters
4008
4009    Return the number of iterations required for the vector version of the
4010    loop to be profitable relative to the cost of the scalar version of the
4011    loop.
4012
4013    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4014    of iterations for vectorization.  -1 value means loop vectorization
4015    is not profitable.  This returned value may be used for dynamic
4016    profitability check.
4017
4018    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4019    for static check against estimated number of iterations.  */
4020
4021 static void
4022 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4023                                     int *ret_min_profitable_niters,
4024                                     int *ret_min_profitable_estimate,
4025                                     unsigned *suggested_unroll_factor)
4026 {
4027   int min_profitable_iters;
4028   int min_profitable_estimate;
4029   int peel_iters_prologue;
4030   int peel_iters_epilogue;
4031   unsigned vec_inside_cost = 0;
4032   int vec_outside_cost = 0;
4033   unsigned vec_prologue_cost = 0;
4034   unsigned vec_epilogue_cost = 0;
4035   int scalar_single_iter_cost = 0;
4036   int scalar_outside_cost = 0;
4037   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4038   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4039   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4040
4041   /* Cost model disabled.  */
4042   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4043     {
4044       if (dump_enabled_p ())
4045         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4046       *ret_min_profitable_niters = 0;
4047       *ret_min_profitable_estimate = 0;
4048       return;
4049     }
4050
4051   /* Requires loop versioning tests to handle misalignment.  */
4052   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4053     {
4054       /*  FIXME: Make cost depend on complexity of individual check.  */
4055       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4056       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4057       if (dump_enabled_p ())
4058         dump_printf (MSG_NOTE,
4059                      "cost model: Adding cost of checks for loop "
4060                      "versioning to treat misalignment.\n");
4061     }
4062
4063   /* Requires loop versioning with alias checks.  */
4064   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4065     {
4066       /*  FIXME: Make cost depend on complexity of individual check.  */
4067       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4068       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4069       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4070       if (len)
4071         /* Count LEN - 1 ANDs and LEN comparisons.  */
4072         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4073                               scalar_stmt, vect_prologue);
4074       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4075       if (len)
4076         {
4077           /* Count LEN - 1 ANDs and LEN comparisons.  */
4078           unsigned int nstmts = len * 2 - 1;
4079           /* +1 for each bias that needs adding.  */
4080           for (unsigned int i = 0; i < len; ++i)
4081             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4082               nstmts += 1;
4083           (void) add_stmt_cost (target_cost_data, nstmts,
4084                                 scalar_stmt, vect_prologue);
4085         }
4086       if (dump_enabled_p ())
4087         dump_printf (MSG_NOTE,
4088                      "cost model: Adding cost of checks for loop "
4089                      "versioning aliasing.\n");
4090     }
4091
4092   /* Requires loop versioning with niter checks.  */
4093   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4094     {
4095       /*  FIXME: Make cost depend on complexity of individual check.  */
4096       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4097                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4098       if (dump_enabled_p ())
4099         dump_printf (MSG_NOTE,
4100                      "cost model: Adding cost of checks for loop "
4101                      "versioning niters.\n");
4102     }
4103
4104   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4105     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4106                           vect_prologue);
4107
4108   /* Count statements in scalar loop.  Using this as scalar cost for a single
4109      iteration for now.
4110
4111      TODO: Add outer loop support.
4112
4113      TODO: Consider assigning different costs to different scalar
4114      statements.  */
4115
4116   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4117
4118   /* Add additional cost for the peeled instructions in prologue and epilogue
4119      loop.  (For fully-masked loops there will be no peeling.)
4120
4121      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4122      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4123
4124      TODO: Build an expression that represents peel_iters for prologue and
4125      epilogue to be used in a run-time test.  */
4126
4127   bool prologue_need_br_taken_cost = false;
4128   bool prologue_need_br_not_taken_cost = false;
4129
4130   /* Calculate peel_iters_prologue.  */
4131   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4132     peel_iters_prologue = 0;
4133   else if (npeel < 0)
4134     {
4135       peel_iters_prologue = assumed_vf / 2;
4136       if (dump_enabled_p ())
4137         dump_printf (MSG_NOTE, "cost model: "
4138                      "prologue peel iters set to vf/2.\n");
4139
4140       /* If peeled iterations are unknown, count a taken branch and a not taken
4141          branch per peeled loop.  Even if scalar loop iterations are known,
4142          vector iterations are not known since peeled prologue iterations are
4143          not known.  Hence guards remain the same.  */
4144       prologue_need_br_taken_cost = true;
4145       prologue_need_br_not_taken_cost = true;
4146     }
4147   else
4148     {
4149       peel_iters_prologue = npeel;
4150       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4151         /* If peeled iterations are known but number of scalar loop
4152            iterations are unknown, count a taken branch per peeled loop.  */
4153         prologue_need_br_taken_cost = true;
4154     }
4155
4156   bool epilogue_need_br_taken_cost = false;
4157   bool epilogue_need_br_not_taken_cost = false;
4158
4159   /* Calculate peel_iters_epilogue.  */
4160   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4161     /* We need to peel exactly one iteration for gaps.  */
4162     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4163   else if (npeel < 0)
4164     {
4165       /* If peeling for alignment is unknown, loop bound of main loop
4166          becomes unknown.  */
4167       peel_iters_epilogue = assumed_vf / 2;
4168       if (dump_enabled_p ())
4169         dump_printf (MSG_NOTE, "cost model: "
4170                      "epilogue peel iters set to vf/2 because "
4171                      "peeling for alignment is unknown.\n");
4172
4173       /* See the same reason above in peel_iters_prologue calculation.  */
4174       epilogue_need_br_taken_cost = true;
4175       epilogue_need_br_not_taken_cost = true;
4176     }
4177   else
4178     {
4179       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4180       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4181         /* If peeled iterations are known but number of scalar loop
4182            iterations are unknown, count a taken branch per peeled loop.  */
4183         epilogue_need_br_taken_cost = true;
4184     }
4185
4186   stmt_info_for_cost *si;
4187   int j;
4188   /* Add costs associated with peel_iters_prologue.  */
4189   if (peel_iters_prologue)
4190     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4191       {
4192         (void) add_stmt_cost (target_cost_data,
4193                               si->count * peel_iters_prologue, si->kind,
4194                               si->stmt_info, si->node, si->vectype,
4195                               si->misalign, vect_prologue);
4196       }
4197
4198   /* Add costs associated with peel_iters_epilogue.  */
4199   if (peel_iters_epilogue)
4200     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4201       {
4202         (void) add_stmt_cost (target_cost_data,
4203                               si->count * peel_iters_epilogue, si->kind,
4204                               si->stmt_info, si->node, si->vectype,
4205                               si->misalign, vect_epilogue);
4206       }
4207
4208   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4209
4210   if (prologue_need_br_taken_cost)
4211     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4212                           vect_prologue);
4213
4214   if (prologue_need_br_not_taken_cost)
4215     (void) add_stmt_cost (target_cost_data, 1,
4216                           cond_branch_not_taken, vect_prologue);
4217
4218   if (epilogue_need_br_taken_cost)
4219     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4220                           vect_epilogue);
4221
4222   if (epilogue_need_br_not_taken_cost)
4223     (void) add_stmt_cost (target_cost_data, 1,
4224                           cond_branch_not_taken, vect_epilogue);
4225
4226   /* Take care of special costs for rgroup controls of partial vectors.  */
4227   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4228     {
4229       /* Calculate how many masks we need to generate.  */
4230       unsigned int num_masks = 0;
4231       rgroup_controls *rgm;
4232       unsigned int num_vectors_m1;
4233       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4234         if (rgm->type)
4235           num_masks += num_vectors_m1 + 1;
4236       gcc_assert (num_masks > 0);
4237
4238       /* In the worst case, we need to generate each mask in the prologue
4239          and in the loop body.  One of the loop body mask instructions
4240          replaces the comparison in the scalar loop, and since we don't
4241          count the scalar comparison against the scalar body, we shouldn't
4242          count that vector instruction against the vector body either.
4243
4244          Sometimes we can use unpacks instead of generating prologue
4245          masks and sometimes the prologue mask will fold to a constant,
4246          so the actual prologue cost might be smaller.  However, it's
4247          simpler and safer to use the worst-case cost; if this ends up
4248          being the tie-breaker between vectorizing or not, then it's
4249          probably better not to vectorize.  */
4250       (void) add_stmt_cost (target_cost_data, num_masks,
4251                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4252                             vect_prologue);
4253       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4254                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4255                             vect_body);
4256     }
4257   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4258     {
4259       /* Referring to the functions vect_set_loop_condition_partial_vectors
4260          and vect_set_loop_controls_directly, we need to generate each
4261          length in the prologue and in the loop body if required. Although
4262          there are some possible optimizations, we consider the worst case
4263          here.  */
4264
4265       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4266       signed char partial_load_store_bias
4267         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4268       bool need_iterate_p
4269         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4270            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4271
4272       /* Calculate how many statements to be added.  */
4273       unsigned int prologue_stmts = 0;
4274       unsigned int body_stmts = 0;
4275
4276       rgroup_controls *rgc;
4277       unsigned int num_vectors_m1;
4278       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4279         if (rgc->type)
4280           {
4281             /* May need one SHIFT for nitems_total computation.  */
4282             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4283             if (nitems != 1 && !niters_known_p)
4284               prologue_stmts += 1;
4285
4286             /* May need one MAX and one MINUS for wrap around.  */
4287             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4288               prologue_stmts += 2;
4289
4290             /* Need one MAX and one MINUS for each batch limit excepting for
4291                the 1st one.  */
4292             prologue_stmts += num_vectors_m1 * 2;
4293
4294             unsigned int num_vectors = num_vectors_m1 + 1;
4295
4296             /* Need to set up lengths in prologue, only one MIN required
4297                for each since start index is zero.  */
4298             prologue_stmts += num_vectors;
4299
4300             /* If we have a non-zero partial load bias, we need one PLUS
4301                to adjust the load length.  */
4302             if (partial_load_store_bias != 0)
4303               body_stmts += 1;
4304
4305             /* Each may need two MINs and one MINUS to update lengths in body
4306                for next iteration.  */
4307             if (need_iterate_p)
4308               body_stmts += 3 * num_vectors;
4309           }
4310
4311       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4312                             scalar_stmt, vect_prologue);
4313       (void) add_stmt_cost (target_cost_data, body_stmts,
4314                             scalar_stmt, vect_body);
4315     }
4316
4317   /* FORNOW: The scalar outside cost is incremented in one of the
4318      following ways:
4319
4320      1. The vectorizer checks for alignment and aliasing and generates
4321      a condition that allows dynamic vectorization.  A cost model
4322      check is ANDED with the versioning condition.  Hence scalar code
4323      path now has the added cost of the versioning check.
4324
4325        if (cost > th & versioning_check)
4326          jmp to vector code
4327
4328      Hence run-time scalar is incremented by not-taken branch cost.
4329
4330      2. The vectorizer then checks if a prologue is required.  If the
4331      cost model check was not done before during versioning, it has to
4332      be done before the prologue check.
4333
4334        if (cost <= th)
4335          prologue = scalar_iters
4336        if (prologue == 0)
4337          jmp to vector code
4338        else
4339          execute prologue
4340        if (prologue == num_iters)
4341          go to exit
4342
4343      Hence the run-time scalar cost is incremented by a taken branch,
4344      plus a not-taken branch, plus a taken branch cost.
4345
4346      3. The vectorizer then checks if an epilogue is required.  If the
4347      cost model check was not done before during prologue check, it
4348      has to be done with the epilogue check.
4349
4350        if (prologue == 0)
4351          jmp to vector code
4352        else
4353          execute prologue
4354        if (prologue == num_iters)
4355          go to exit
4356        vector code:
4357          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4358            jmp to epilogue
4359
4360      Hence the run-time scalar cost should be incremented by 2 taken
4361      branches.
4362
4363      TODO: The back end may reorder the BBS's differently and reverse
4364      conditions/branch directions.  Change the estimates below to
4365      something more reasonable.  */
4366
4367   /* If the number of iterations is known and we do not do versioning, we can
4368      decide whether to vectorize at compile time.  Hence the scalar version
4369      do not carry cost model guard costs.  */
4370   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4371       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4372     {
4373       /* Cost model check occurs at versioning.  */
4374       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4375         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4376       else
4377         {
4378           /* Cost model check occurs at prologue generation.  */
4379           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4380             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4381               + vect_get_stmt_cost (cond_branch_not_taken);
4382           /* Cost model check occurs at epilogue generation.  */
4383           else
4384             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4385         }
4386     }
4387
4388   /* Complete the target-specific cost calculations.  */
4389   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4390                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4391                suggested_unroll_factor);
4392
4393   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4394       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4395       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4396                     *suggested_unroll_factor,
4397                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4398     {
4399       if (dump_enabled_p ())
4400         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4401                          "can't unroll as unrolled vectorization factor larger"
4402                          " than maximum vectorization factor: "
4403                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4404                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4405       *suggested_unroll_factor = 1;
4406     }
4407
4408   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4409
4410   if (dump_enabled_p ())
4411     {
4412       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4413       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4414                    vec_inside_cost);
4415       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4416                    vec_prologue_cost);
4417       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4418                    vec_epilogue_cost);
4419       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4420                    scalar_single_iter_cost);
4421       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4422                    scalar_outside_cost);
4423       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4424                    vec_outside_cost);
4425       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4426                    peel_iters_prologue);
4427       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4428                    peel_iters_epilogue);
4429     }
4430
4431   /* Calculate number of iterations required to make the vector version
4432      profitable, relative to the loop bodies only.  The following condition
4433      must hold true:
4434      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4435      where
4436      SIC = scalar iteration cost, VIC = vector iteration cost,
4437      VOC = vector outside cost, VF = vectorization factor,
4438      NPEEL = prologue iterations + epilogue iterations,
4439      SOC = scalar outside cost for run time cost model check.  */
4440
4441   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4442                           - vec_inside_cost);
4443   if (saving_per_viter <= 0)
4444     {
4445       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4446         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4447                     "vectorization did not happen for a simd loop");
4448
4449       if (dump_enabled_p ())
4450         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4451                          "cost model: the vector iteration cost = %d "
4452                          "divided by the scalar iteration cost = %d "
4453                          "is greater or equal to the vectorization factor = %d"
4454                          ".\n",
4455                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4456       *ret_min_profitable_niters = -1;
4457       *ret_min_profitable_estimate = -1;
4458       return;
4459     }
4460
4461   /* ??? The "if" arm is written to handle all cases; see below for what
4462      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4463   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4464     {
4465       /* Rewriting the condition above in terms of the number of
4466          vector iterations (vniters) rather than the number of
4467          scalar iterations (niters) gives:
4468
4469          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4470
4471          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4472
4473          For integer N, X and Y when X > 0:
4474
4475          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4476       int outside_overhead = (vec_outside_cost
4477                               - scalar_single_iter_cost * peel_iters_prologue
4478                               - scalar_single_iter_cost * peel_iters_epilogue
4479                               - scalar_outside_cost);
4480       /* We're only interested in cases that require at least one
4481          vector iteration.  */
4482       int min_vec_niters = 1;
4483       if (outside_overhead > 0)
4484         min_vec_niters = outside_overhead / saving_per_viter + 1;
4485
4486       if (dump_enabled_p ())
4487         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4488                      min_vec_niters);
4489
4490       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4491         {
4492           /* Now that we know the minimum number of vector iterations,
4493              find the minimum niters for which the scalar cost is larger:
4494
4495              SIC * niters > VIC * vniters + VOC - SOC
4496
4497              We know that the minimum niters is no more than
4498              vniters * VF + NPEEL, but it might be (and often is) less
4499              than that if a partial vector iteration is cheaper than the
4500              equivalent scalar code.  */
4501           int threshold = (vec_inside_cost * min_vec_niters
4502                            + vec_outside_cost
4503                            - scalar_outside_cost);
4504           if (threshold <= 0)
4505             min_profitable_iters = 1;
4506           else
4507             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4508         }
4509       else
4510         /* Convert the number of vector iterations into a number of
4511            scalar iterations.  */
4512         min_profitable_iters = (min_vec_niters * assumed_vf
4513                                 + peel_iters_prologue
4514                                 + peel_iters_epilogue);
4515     }
4516   else
4517     {
4518       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4519                               * assumed_vf
4520                               - vec_inside_cost * peel_iters_prologue
4521                               - vec_inside_cost * peel_iters_epilogue);
4522       if (min_profitable_iters <= 0)
4523         min_profitable_iters = 0;
4524       else
4525         {
4526           min_profitable_iters /= saving_per_viter;
4527
4528           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4529               <= (((int) vec_inside_cost * min_profitable_iters)
4530                   + (((int) vec_outside_cost - scalar_outside_cost)
4531                      * assumed_vf)))
4532             min_profitable_iters++;
4533         }
4534     }
4535
4536   if (dump_enabled_p ())
4537     dump_printf (MSG_NOTE,
4538                  "  Calculated minimum iters for profitability: %d\n",
4539                  min_profitable_iters);
4540
4541   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4542       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4543     /* We want the vectorized loop to execute at least once.  */
4544     min_profitable_iters = assumed_vf + peel_iters_prologue;
4545   else if (min_profitable_iters < peel_iters_prologue)
4546     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4547        vectorized loop executes at least once.  */
4548     min_profitable_iters = peel_iters_prologue;
4549
4550   if (dump_enabled_p ())
4551     dump_printf_loc (MSG_NOTE, vect_location,
4552                      "  Runtime profitability threshold = %d\n",
4553                      min_profitable_iters);
4554
4555   *ret_min_profitable_niters = min_profitable_iters;
4556
4557   /* Calculate number of iterations required to make the vector version
4558      profitable, relative to the loop bodies only.
4559
4560      Non-vectorized variant is SIC * niters and it must win over vector
4561      variant on the expected loop trip count.  The following condition must hold true:
4562      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4563
4564   if (vec_outside_cost <= 0)
4565     min_profitable_estimate = 0;
4566   /* ??? This "else if" arm is written to handle all cases; see below for
4567      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4568   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4569     {
4570       /* This is a repeat of the code above, but with + SOC rather
4571          than - SOC.  */
4572       int outside_overhead = (vec_outside_cost
4573                               - scalar_single_iter_cost * peel_iters_prologue
4574                               - scalar_single_iter_cost * peel_iters_epilogue
4575                               + scalar_outside_cost);
4576       int min_vec_niters = 1;
4577       if (outside_overhead > 0)
4578         min_vec_niters = outside_overhead / saving_per_viter + 1;
4579
4580       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4581         {
4582           int threshold = (vec_inside_cost * min_vec_niters
4583                            + vec_outside_cost
4584                            + scalar_outside_cost);
4585           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4586         }
4587       else
4588         min_profitable_estimate = (min_vec_niters * assumed_vf
4589                                    + peel_iters_prologue
4590                                    + peel_iters_epilogue);
4591     }
4592   else
4593     {
4594       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4595                                  * assumed_vf
4596                                  - vec_inside_cost * peel_iters_prologue
4597                                  - vec_inside_cost * peel_iters_epilogue)
4598                                  / ((scalar_single_iter_cost * assumed_vf)
4599                                    - vec_inside_cost);
4600     }
4601   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4602   if (dump_enabled_p ())
4603     dump_printf_loc (MSG_NOTE, vect_location,
4604                      "  Static estimate profitability threshold = %d\n",
4605                      min_profitable_estimate);
4606
4607   *ret_min_profitable_estimate = min_profitable_estimate;
4608 }
4609
4610 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4611    vector elements (not bits) for a vector with NELT elements.  */
4612 static void
4613 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4614                               vec_perm_builder *sel)
4615 {
4616   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4617      by vec_perm_indices.  */
4618   sel->new_vector (nelt, 1, 3);
4619   for (unsigned int i = 0; i < 3; i++)
4620     sel->quick_push (i + offset);
4621 }
4622
4623 /* Checks whether the target supports whole-vector shifts for vectors of mode
4624    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4625    it supports vec_perm_const with masks for all necessary shift amounts.  */
4626 static bool
4627 have_whole_vector_shift (machine_mode mode)
4628 {
4629   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4630     return true;
4631
4632   /* Variable-length vectors should be handled via the optab.  */
4633   unsigned int nelt;
4634   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4635     return false;
4636
4637   vec_perm_builder sel;
4638   vec_perm_indices indices;
4639   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4640     {
4641       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4642       indices.new_vector (sel, 2, nelt);
4643       if (!can_vec_perm_const_p (mode, mode, indices, false))
4644         return false;
4645     }
4646   return true;
4647 }
4648
4649 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4650    multiplication operands have differing signs and (b) we intend
4651    to emulate the operation using a series of signed DOT_PROD_EXPRs.
4652    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
4653
4654 static bool
4655 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
4656                                  stmt_vec_info stmt_info)
4657 {
4658   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4659   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4660     return false;
4661
4662   tree rhs1 = gimple_assign_rhs1 (assign);
4663   tree rhs2 = gimple_assign_rhs2 (assign);
4664   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4665     return false;
4666
4667   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4668   gcc_assert (reduc_info->is_reduc_info);
4669   return !directly_supported_p (DOT_PROD_EXPR,
4670                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
4671                                 optab_vector_mixed_sign);
4672 }
4673
4674 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4675    functions. Design better to avoid maintenance issues.  */
4676
4677 /* Function vect_model_reduction_cost.
4678
4679    Models cost for a reduction operation, including the vector ops
4680    generated within the strip-mine loop in some cases, the initial
4681    definition before the loop, and the epilogue code that must be generated.  */
4682
4683 static void
4684 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4685                            stmt_vec_info stmt_info, internal_fn reduc_fn,
4686                            vect_reduction_type reduction_type,
4687                            int ncopies, stmt_vector_for_cost *cost_vec)
4688 {
4689   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4690   tree vectype;
4691   machine_mode mode;
4692   class loop *loop = NULL;
4693
4694   if (loop_vinfo)
4695     loop = LOOP_VINFO_LOOP (loop_vinfo);
4696
4697   /* Condition reductions generate two reductions in the loop.  */
4698   if (reduction_type == COND_REDUCTION)
4699     ncopies *= 2;
4700
4701   vectype = STMT_VINFO_VECTYPE (stmt_info);
4702   mode = TYPE_MODE (vectype);
4703   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4704
4705   gimple_match_op op;
4706   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4707     gcc_unreachable ();
4708
4709   bool emulated_mixed_dot_prod
4710     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
4711   if (reduction_type == EXTRACT_LAST_REDUCTION)
4712     /* No extra instructions are needed in the prologue.  The loop body
4713        operations are costed in vectorizable_condition.  */
4714     inside_cost = 0;
4715   else if (reduction_type == FOLD_LEFT_REDUCTION)
4716     {
4717       /* No extra instructions needed in the prologue.  */
4718       prologue_cost = 0;
4719
4720       if (reduc_fn != IFN_LAST)
4721         /* Count one reduction-like operation per vector.  */
4722         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4723                                         stmt_info, 0, vect_body);
4724       else
4725         {
4726           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4727           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4728           inside_cost = record_stmt_cost (cost_vec, nelements,
4729                                           vec_to_scalar, stmt_info, 0,
4730                                           vect_body);
4731           inside_cost += record_stmt_cost (cost_vec, nelements,
4732                                            scalar_stmt, stmt_info, 0,
4733                                            vect_body);
4734         }
4735     }
4736   else
4737     {
4738       /* Add in the cost of the initial definitions.  */
4739       int prologue_stmts;
4740       if (reduction_type == COND_REDUCTION)
4741         /* For cond reductions we have four vectors: initial index, step,
4742            initial result of the data reduction, initial value of the index
4743            reduction.  */
4744         prologue_stmts = 4;
4745       else if (emulated_mixed_dot_prod)
4746         /* We need the initial reduction value and two invariants:
4747            one that contains the minimum signed value and one that
4748            contains half of its negative.  */
4749         prologue_stmts = 3;
4750       else
4751         prologue_stmts = 1;
4752       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4753                                          scalar_to_vec, stmt_info, 0,
4754                                          vect_prologue);
4755     }
4756
4757   /* Determine cost of epilogue code.
4758
4759      We have a reduction operator that will reduce the vector in one statement.
4760      Also requires scalar extract.  */
4761
4762   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4763     {
4764       if (reduc_fn != IFN_LAST)
4765         {
4766           if (reduction_type == COND_REDUCTION)
4767             {
4768               /* An EQ stmt and an COND_EXPR stmt.  */
4769               epilogue_cost += record_stmt_cost (cost_vec, 2,
4770                                                  vector_stmt, stmt_info, 0,
4771                                                  vect_epilogue);
4772               /* Reduction of the max index and a reduction of the found
4773                  values.  */
4774               epilogue_cost += record_stmt_cost (cost_vec, 2,
4775                                                  vec_to_scalar, stmt_info, 0,
4776                                                  vect_epilogue);
4777               /* A broadcast of the max value.  */
4778               epilogue_cost += record_stmt_cost (cost_vec, 1,
4779                                                  scalar_to_vec, stmt_info, 0,
4780                                                  vect_epilogue);
4781             }
4782           else
4783             {
4784               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4785                                                  stmt_info, 0, vect_epilogue);
4786               epilogue_cost += record_stmt_cost (cost_vec, 1,
4787                                                  vec_to_scalar, stmt_info, 0,
4788                                                  vect_epilogue);
4789             }
4790         }
4791       else if (reduction_type == COND_REDUCTION)
4792         {
4793           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4794           /* Extraction of scalar elements.  */
4795           epilogue_cost += record_stmt_cost (cost_vec,
4796                                              2 * estimated_nunits,
4797                                              vec_to_scalar, stmt_info, 0,
4798                                              vect_epilogue);
4799           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4800           epilogue_cost += record_stmt_cost (cost_vec,
4801                                              2 * estimated_nunits - 3,
4802                                              scalar_stmt, stmt_info, 0,
4803                                              vect_epilogue);
4804         }
4805       else if (reduction_type == EXTRACT_LAST_REDUCTION
4806                || reduction_type == FOLD_LEFT_REDUCTION)
4807         /* No extra instructions need in the epilogue.  */
4808         ;
4809       else
4810         {
4811           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4812           tree bitsize = TYPE_SIZE (op.type);
4813           int element_bitsize = tree_to_uhwi (bitsize);
4814           int nelements = vec_size_in_bits / element_bitsize;
4815
4816           if (op.code == COND_EXPR)
4817             op.code = MAX_EXPR;
4818
4819           /* We have a whole vector shift available.  */
4820           if (VECTOR_MODE_P (mode)
4821               && directly_supported_p (op.code, vectype)
4822               && have_whole_vector_shift (mode))
4823             {
4824               /* Final reduction via vector shifts and the reduction operator.
4825                  Also requires scalar extract.  */
4826               epilogue_cost += record_stmt_cost (cost_vec,
4827                                                  exact_log2 (nelements) * 2,
4828                                                  vector_stmt, stmt_info, 0,
4829                                                  vect_epilogue);
4830               epilogue_cost += record_stmt_cost (cost_vec, 1,
4831                                                  vec_to_scalar, stmt_info, 0,
4832                                                  vect_epilogue);
4833             }
4834           else
4835             /* Use extracts and reduction op for final reduction.  For N
4836                elements, we have N extracts and N-1 reduction ops.  */
4837             epilogue_cost += record_stmt_cost (cost_vec,
4838                                                nelements + nelements - 1,
4839                                                vector_stmt, stmt_info, 0,
4840                                                vect_epilogue);
4841         }
4842     }
4843
4844   if (dump_enabled_p ())
4845     dump_printf (MSG_NOTE,
4846                  "vect_model_reduction_cost: inside_cost = %d, "
4847                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4848                  prologue_cost, epilogue_cost);
4849 }
4850
4851 /* SEQ is a sequence of instructions that initialize the reduction
4852    described by REDUC_INFO.  Emit them in the appropriate place.  */
4853
4854 static void
4855 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4856                                 stmt_vec_info reduc_info, gimple *seq)
4857 {
4858   if (reduc_info->reused_accumulator)
4859     {
4860       /* When reusing an accumulator from the main loop, we only need
4861          initialization instructions if the main loop can be skipped.
4862          In that case, emit the initialization instructions at the end
4863          of the guard block that does the skip.  */
4864       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4865       gcc_assert (skip_edge);
4866       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4867       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4868     }
4869   else
4870     {
4871       /* The normal case: emit the initialization instructions on the
4872          preheader edge.  */
4873       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4874       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4875     }
4876 }
4877
4878 /* Function get_initial_def_for_reduction
4879
4880    Input:
4881    REDUC_INFO - the info_for_reduction
4882    INIT_VAL - the initial value of the reduction variable
4883    NEUTRAL_OP - a value that has no effect on the reduction, as per
4884                 neutral_op_for_reduction
4885
4886    Output:
4887    Return a vector variable, initialized according to the operation that
4888         STMT_VINFO performs. This vector will be used as the initial value
4889         of the vector of partial results.
4890
4891    The value we need is a vector in which element 0 has value INIT_VAL
4892    and every other element has value NEUTRAL_OP.  */
4893
4894 static tree
4895 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4896                                stmt_vec_info reduc_info,
4897                                tree init_val, tree neutral_op)
4898 {
4899   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4900   tree scalar_type = TREE_TYPE (init_val);
4901   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4902   tree init_def;
4903   gimple_seq stmts = NULL;
4904
4905   gcc_assert (vectype);
4906
4907   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4908               || SCALAR_FLOAT_TYPE_P (scalar_type));
4909
4910   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4911               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4912
4913   if (operand_equal_p (init_val, neutral_op))
4914     {
4915       /* If both elements are equal then the vector described above is
4916          just a splat.  */
4917       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4918       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4919     }
4920   else
4921     {
4922       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4923       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4924       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4925         {
4926           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4927              element 0.  */
4928           init_def = gimple_build_vector_from_val (&stmts, vectype,
4929                                                    neutral_op);
4930           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4931                                    vectype, init_def, init_val);
4932         }
4933       else
4934         {
4935           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
4936           tree_vector_builder elts (vectype, 1, 2);
4937           elts.quick_push (init_val);
4938           elts.quick_push (neutral_op);
4939           init_def = gimple_build_vector (&stmts, &elts);
4940         }
4941     }
4942
4943   if (stmts)
4944     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4945   return init_def;
4946 }
4947
4948 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4949    which performs a reduction involving GROUP_SIZE scalar statements.
4950    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
4951    is nonnull, introducing extra elements of that value will not change the
4952    result.  */
4953
4954 static void
4955 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4956                                 stmt_vec_info reduc_info,
4957                                 vec<tree> *vec_oprnds,
4958                                 unsigned int number_of_vectors,
4959                                 unsigned int group_size, tree neutral_op)
4960 {
4961   vec<tree> &initial_values = reduc_info->reduc_initial_values;
4962   unsigned HOST_WIDE_INT nunits;
4963   unsigned j, number_of_places_left_in_vector;
4964   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4965   unsigned int i;
4966
4967   gcc_assert (group_size == initial_values.length () || neutral_op);
4968
4969   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4970      created vectors. It is greater than 1 if unrolling is performed.
4971
4972      For example, we have two scalar operands, s1 and s2 (e.g., group of
4973      strided accesses of size two), while NUNITS is four (i.e., four scalars
4974      of this type can be packed in a vector).  The output vector will contain
4975      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4976      will be 2).
4977
4978      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4979      vectors containing the operands.
4980
4981      For example, NUNITS is four as before, and the group size is 8
4982      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4983      {s5, s6, s7, s8}.  */
4984
4985   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4986     nunits = group_size;
4987
4988   number_of_places_left_in_vector = nunits;
4989   bool constant_p = true;
4990   tree_vector_builder elts (vector_type, nunits, 1);
4991   elts.quick_grow (nunits);
4992   gimple_seq ctor_seq = NULL;
4993   for (j = 0; j < nunits * number_of_vectors; ++j)
4994     {
4995       tree op;
4996       i = j % group_size;
4997
4998       /* Get the def before the loop.  In reduction chain we have only
4999          one initial value.  Else we have as many as PHIs in the group.  */
5000       if (i >= initial_values.length () || (j > i && neutral_op))
5001         op = neutral_op;
5002       else
5003         op = initial_values[i];
5004
5005       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5006       number_of_places_left_in_vector--;
5007       elts[nunits - number_of_places_left_in_vector - 1] = op;
5008       if (!CONSTANT_CLASS_P (op))
5009         constant_p = false;
5010
5011       if (number_of_places_left_in_vector == 0)
5012         {
5013           tree init;
5014           if (constant_p && !neutral_op
5015               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5016               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5017             /* Build the vector directly from ELTS.  */
5018             init = gimple_build_vector (&ctor_seq, &elts);
5019           else if (neutral_op)
5020             {
5021               /* Build a vector of the neutral value and shift the
5022                  other elements into place.  */
5023               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5024                                                    neutral_op);
5025               int k = nunits;
5026               while (k > 0 && elts[k - 1] == neutral_op)
5027                 k -= 1;
5028               while (k > 0)
5029                 {
5030                   k -= 1;
5031                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5032                                        vector_type, init, elts[k]);
5033                 }
5034             }
5035           else
5036             {
5037               /* First time round, duplicate ELTS to fill the
5038                  required number of vectors.  */
5039               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5040                                         elts, number_of_vectors, *vec_oprnds);
5041               break;
5042             }
5043           vec_oprnds->quick_push (init);
5044
5045           number_of_places_left_in_vector = nunits;
5046           elts.new_vector (vector_type, nunits, 1);
5047           elts.quick_grow (nunits);
5048           constant_p = true;
5049         }
5050     }
5051   if (ctor_seq != NULL)
5052     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5053 }
5054
5055 /* For a statement STMT_INFO taking part in a reduction operation return
5056    the stmt_vec_info the meta information is stored on.  */
5057
5058 stmt_vec_info
5059 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5060 {
5061   stmt_info = vect_orig_stmt (stmt_info);
5062   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5063   if (!is_a <gphi *> (stmt_info->stmt)
5064       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5065     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5066   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5067   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5068     {
5069       if (gimple_phi_num_args (phi) == 1)
5070         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5071     }
5072   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5073     {
5074       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5075       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5076         stmt_info = info;
5077     }
5078   return stmt_info;
5079 }
5080
5081 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5082    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5083    return false.  */
5084
5085 static bool
5086 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5087                                 stmt_vec_info reduc_info)
5088 {
5089   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5090   if (!main_loop_vinfo)
5091     return false;
5092
5093   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5094     return false;
5095
5096   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5097   auto_vec<tree, 16> main_loop_results (num_phis);
5098   auto_vec<tree, 16> initial_values (num_phis);
5099   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5100     {
5101       /* The epilogue loop can be entered either from the main loop or
5102          from an earlier guard block.  */
5103       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5104       for (tree incoming_value : reduc_info->reduc_initial_values)
5105         {
5106           /* Look for:
5107
5108                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5109                                     INITIAL_VALUE(guard block)>.  */
5110           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5111
5112           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5113           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5114
5115           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5116           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5117
5118           main_loop_results.quick_push (from_main_loop);
5119           initial_values.quick_push (from_skip);
5120         }
5121     }
5122   else
5123     /* The main loop dominates the epilogue loop.  */
5124     main_loop_results.splice (reduc_info->reduc_initial_values);
5125
5126   /* See if the main loop has the kind of accumulator we need.  */
5127   vect_reusable_accumulator *accumulator
5128     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5129   if (!accumulator
5130       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5131       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5132                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5133     return false;
5134
5135   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5136   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5137   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5138   unsigned HOST_WIDE_INT m;
5139   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5140                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5141     return false;
5142   /* Check the intermediate vector types and operations are available.  */
5143   tree prev_vectype = old_vectype;
5144   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5145   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5146     {
5147       intermediate_nunits = exact_div (intermediate_nunits, 2);
5148       tree intermediate_vectype = get_related_vectype_for_scalar_type
5149         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5150       if (!intermediate_vectype
5151           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5152                                     intermediate_vectype)
5153           || !can_vec_extract (TYPE_MODE (prev_vectype),
5154                                TYPE_MODE (intermediate_vectype)))
5155         return false;
5156       prev_vectype = intermediate_vectype;
5157     }
5158
5159   /* Non-SLP reductions might apply an adjustment after the reduction
5160      operation, in order to simplify the initialization of the accumulator.
5161      If the epilogue loop carries on from where the main loop left off,
5162      it should apply the same adjustment to the final reduction result.
5163
5164      If the epilogue loop can also be entered directly (rather than via
5165      the main loop), we need to be able to handle that case in the same way,
5166      with the same adjustment.  (In principle we could add a PHI node
5167      to select the correct adjustment, but in practice that shouldn't be
5168      necessary.)  */
5169   tree main_adjustment
5170     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5171   if (loop_vinfo->main_loop_edge && main_adjustment)
5172     {
5173       gcc_assert (num_phis == 1);
5174       tree initial_value = initial_values[0];
5175       /* Check that we can use INITIAL_VALUE as the adjustment and
5176          initialize the accumulator with a neutral value instead.  */
5177       if (!operand_equal_p (initial_value, main_adjustment))
5178         return false;
5179       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5180       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5181                                                     code, initial_value);
5182     }
5183   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5184   reduc_info->reduc_initial_values.truncate (0);
5185   reduc_info->reduc_initial_values.splice (initial_values);
5186   reduc_info->reused_accumulator = accumulator;
5187   return true;
5188 }
5189
5190 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5191    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5192
5193 static tree
5194 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5195                             gimple_seq *seq)
5196 {
5197   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5198   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5199   tree stype = TREE_TYPE (vectype);
5200   tree new_temp = vec_def;
5201   while (nunits > nunits1)
5202     {
5203       nunits /= 2;
5204       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5205                                                            stype, nunits);
5206       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5207
5208       /* The target has to make sure we support lowpart/highpart
5209          extraction, either via direct vector extract or through
5210          an integer mode punning.  */
5211       tree dst1, dst2;
5212       gimple *epilog_stmt;
5213       if (convert_optab_handler (vec_extract_optab,
5214                                  TYPE_MODE (TREE_TYPE (new_temp)),
5215                                  TYPE_MODE (vectype1))
5216           != CODE_FOR_nothing)
5217         {
5218           /* Extract sub-vectors directly once vec_extract becomes
5219              a conversion optab.  */
5220           dst1 = make_ssa_name (vectype1);
5221           epilog_stmt
5222               = gimple_build_assign (dst1, BIT_FIELD_REF,
5223                                      build3 (BIT_FIELD_REF, vectype1,
5224                                              new_temp, TYPE_SIZE (vectype1),
5225                                              bitsize_int (0)));
5226           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5227           dst2 =  make_ssa_name (vectype1);
5228           epilog_stmt
5229               = gimple_build_assign (dst2, BIT_FIELD_REF,
5230                                      build3 (BIT_FIELD_REF, vectype1,
5231                                              new_temp, TYPE_SIZE (vectype1),
5232                                              bitsize_int (bitsize)));
5233           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5234         }
5235       else
5236         {
5237           /* Extract via punning to appropriately sized integer mode
5238              vector.  */
5239           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5240           tree etype = build_vector_type (eltype, 2);
5241           gcc_assert (convert_optab_handler (vec_extract_optab,
5242                                              TYPE_MODE (etype),
5243                                              TYPE_MODE (eltype))
5244                       != CODE_FOR_nothing);
5245           tree tem = make_ssa_name (etype);
5246           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5247                                              build1 (VIEW_CONVERT_EXPR,
5248                                                      etype, new_temp));
5249           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5250           new_temp = tem;
5251           tem = make_ssa_name (eltype);
5252           epilog_stmt
5253               = gimple_build_assign (tem, BIT_FIELD_REF,
5254                                      build3 (BIT_FIELD_REF, eltype,
5255                                              new_temp, TYPE_SIZE (eltype),
5256                                              bitsize_int (0)));
5257           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5258           dst1 = make_ssa_name (vectype1);
5259           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5260                                              build1 (VIEW_CONVERT_EXPR,
5261                                                      vectype1, tem));
5262           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5263           tem = make_ssa_name (eltype);
5264           epilog_stmt
5265               = gimple_build_assign (tem, BIT_FIELD_REF,
5266                                      build3 (BIT_FIELD_REF, eltype,
5267                                              new_temp, TYPE_SIZE (eltype),
5268                                              bitsize_int (bitsize)));
5269           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5270           dst2 =  make_ssa_name (vectype1);
5271           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5272                                              build1 (VIEW_CONVERT_EXPR,
5273                                                      vectype1, tem));
5274           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5275         }
5276
5277       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5278     }
5279
5280   return new_temp;
5281 }
5282
5283 /* Function vect_create_epilog_for_reduction
5284
5285    Create code at the loop-epilog to finalize the result of a reduction
5286    computation.
5287
5288    STMT_INFO is the scalar reduction stmt that is being vectorized.
5289    SLP_NODE is an SLP node containing a group of reduction statements. The
5290      first one in this group is STMT_INFO.
5291    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5292    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5293      (counting from 0)
5294
5295    This function:
5296    1. Completes the reduction def-use cycles.
5297    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5298       by calling the function specified by REDUC_FN if available, or by
5299       other means (whole-vector shifts or a scalar loop).
5300       The function also creates a new phi node at the loop exit to preserve
5301       loop-closed form, as illustrated below.
5302
5303      The flow at the entry to this function:
5304
5305         loop:
5306           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5307           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5308           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5309         loop_exit:
5310           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5311           use <s_out0>
5312           use <s_out0>
5313
5314      The above is transformed by this function into:
5315
5316         loop:
5317           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5318           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5319           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5320         loop_exit:
5321           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5322           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5323           v_out2 = reduce <v_out1>
5324           s_out3 = extract_field <v_out2, 0>
5325           s_out4 = adjust_result <s_out3>
5326           use <s_out4>
5327           use <s_out4>
5328 */
5329
5330 static void
5331 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5332                                   stmt_vec_info stmt_info,
5333                                   slp_tree slp_node,
5334                                   slp_instance slp_node_instance)
5335 {
5336   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5337   gcc_assert (reduc_info->is_reduc_info);
5338   /* For double reductions we need to get at the inner loop reduction
5339      stmt which has the meta info attached.  Our stmt_info is that of the
5340      loop-closed PHI of the inner loop which we remember as
5341      def for the reduction PHI generation.  */
5342   bool double_reduc = false;
5343   stmt_vec_info rdef_info = stmt_info;
5344   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5345     {
5346       gcc_assert (!slp_node);
5347       double_reduc = true;
5348       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5349                                             (stmt_info->stmt, 0));
5350       stmt_info = vect_stmt_to_vectorize (stmt_info);
5351     }
5352   gphi *reduc_def_stmt
5353     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5354   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5355   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5356   tree vectype;
5357   machine_mode mode;
5358   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5359   basic_block exit_bb;
5360   tree scalar_dest;
5361   tree scalar_type;
5362   gimple *new_phi = NULL, *phi;
5363   gimple_stmt_iterator exit_gsi;
5364   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5365   gimple *epilog_stmt = NULL;
5366   gimple *exit_phi;
5367   tree bitsize;
5368   tree def;
5369   tree orig_name, scalar_result;
5370   imm_use_iterator imm_iter, phi_imm_iter;
5371   use_operand_p use_p, phi_use_p;
5372   gimple *use_stmt;
5373   auto_vec<tree> reduc_inputs;
5374   int j, i;
5375   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5376   unsigned int group_size = 1, k;
5377   auto_vec<gimple *> phis;
5378   /* SLP reduction without reduction chain, e.g.,
5379      # a1 = phi <a2, a0>
5380      # b1 = phi <b2, b0>
5381      a2 = operation (a1)
5382      b2 = operation (b1)  */
5383   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5384   bool direct_slp_reduc;
5385   tree induction_index = NULL_TREE;
5386
5387   if (slp_node)
5388     group_size = SLP_TREE_LANES (slp_node);
5389
5390   if (nested_in_vect_loop_p (loop, stmt_info))
5391     {
5392       outer_loop = loop;
5393       loop = loop->inner;
5394       gcc_assert (!slp_node && double_reduc);
5395     }
5396
5397   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5398   gcc_assert (vectype);
5399   mode = TYPE_MODE (vectype);
5400
5401   tree induc_val = NULL_TREE;
5402   tree adjustment_def = NULL;
5403   if (slp_node)
5404     ;
5405   else
5406     {
5407       /* Optimize: for induction condition reduction, if we can't use zero
5408          for induc_val, use initial_def.  */
5409       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5410         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5411       else if (double_reduc)
5412         ;
5413       else
5414         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5415     }
5416
5417   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5418   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5419   if (slp_reduc)
5420     /* All statements produce live-out values.  */
5421     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5422   else if (slp_node)
5423     {
5424       /* The last statement in the reduction chain produces the live-out
5425          value.  Note SLP optimization can shuffle scalar stmts to
5426          optimize permutations so we have to search for the last stmt.  */
5427       for (k = 0; k < group_size; ++k)
5428         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5429           {
5430             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5431             break;
5432           }
5433     }
5434
5435   unsigned vec_num;
5436   int ncopies;
5437   if (slp_node)
5438     {
5439       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5440       ncopies = 1;
5441     }
5442   else
5443     {
5444       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5445       vec_num = 1;
5446       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5447     }
5448
5449   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5450      which is updated with the current index of the loop for every match of
5451      the original loop's cond_expr (VEC_STMT).  This results in a vector
5452      containing the last time the condition passed for that vector lane.
5453      The first match will be a 1 to allow 0 to be used for non-matching
5454      indexes.  If there are no matches at all then the vector will be all
5455      zeroes.
5456
5457      PR92772: This algorithm is broken for architectures that support
5458      masked vectors, but do not provide fold_extract_last.  */
5459   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5460     {
5461       auto_vec<std::pair<tree, bool>, 2> ccompares;
5462       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5463       cond_info = vect_stmt_to_vectorize (cond_info);
5464       while (cond_info != reduc_info)
5465         {
5466           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5467             {
5468               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5469               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5470               ccompares.safe_push
5471                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5472                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
5473             }
5474           cond_info
5475             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5476                                                  1 + STMT_VINFO_REDUC_IDX
5477                                                         (cond_info)));
5478           cond_info = vect_stmt_to_vectorize (cond_info);
5479         }
5480       gcc_assert (ccompares.length () != 0);
5481
5482       tree indx_before_incr, indx_after_incr;
5483       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5484       int scalar_precision
5485         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5486       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5487       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5488         (TYPE_MODE (vectype), cr_index_scalar_type,
5489          TYPE_VECTOR_SUBPARTS (vectype));
5490
5491       /* First we create a simple vector induction variable which starts
5492          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5493          vector size (STEP).  */
5494
5495       /* Create a {1,2,3,...} vector.  */
5496       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5497
5498       /* Create a vector of the step value.  */
5499       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5500       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5501
5502       /* Create an induction variable.  */
5503       gimple_stmt_iterator incr_gsi;
5504       bool insert_after;
5505       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5506       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5507                  insert_after, &indx_before_incr, &indx_after_incr);
5508
5509       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5510          filled with zeros (VEC_ZERO).  */
5511
5512       /* Create a vector of 0s.  */
5513       tree zero = build_zero_cst (cr_index_scalar_type);
5514       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5515
5516       /* Create a vector phi node.  */
5517       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5518       new_phi = create_phi_node (new_phi_tree, loop->header);
5519       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5520                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5521
5522       /* Now take the condition from the loops original cond_exprs
5523          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5524          every match uses values from the induction variable
5525          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5526          (NEW_PHI_TREE).
5527          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5528          the new cond_expr (INDEX_COND_EXPR).  */
5529       gimple_seq stmts = NULL;
5530       for (int i = ccompares.length () - 1; i != -1; --i)
5531         {
5532           tree ccompare = ccompares[i].first;
5533           if (ccompares[i].second)
5534             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5535                                          cr_index_vector_type,
5536                                          ccompare,
5537                                          indx_before_incr, new_phi_tree);
5538           else
5539             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5540                                          cr_index_vector_type,
5541                                          ccompare,
5542                                          new_phi_tree, indx_before_incr);
5543         }
5544       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5545
5546       /* Update the phi with the vec cond.  */
5547       induction_index = new_phi_tree;
5548       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5549                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5550     }
5551
5552   /* 2. Create epilog code.
5553         The reduction epilog code operates across the elements of the vector
5554         of partial results computed by the vectorized loop.
5555         The reduction epilog code consists of:
5556
5557         step 1: compute the scalar result in a vector (v_out2)
5558         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5559         step 3: adjust the scalar result (s_out3) if needed.
5560
5561         Step 1 can be accomplished using one the following three schemes:
5562           (scheme 1) using reduc_fn, if available.
5563           (scheme 2) using whole-vector shifts, if available.
5564           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5565                      combined.
5566
5567           The overall epilog code looks like this:
5568
5569           s_out0 = phi <s_loop>         # original EXIT_PHI
5570           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5571           v_out2 = reduce <v_out1>              # step 1
5572           s_out3 = extract_field <v_out2, 0>    # step 2
5573           s_out4 = adjust_result <s_out3>       # step 3
5574
5575           (step 3 is optional, and steps 1 and 2 may be combined).
5576           Lastly, the uses of s_out0 are replaced by s_out4.  */
5577
5578
5579   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5580          v_out1 = phi <VECT_DEF>
5581          Store them in NEW_PHIS.  */
5582   if (double_reduc)
5583     loop = outer_loop;
5584   exit_bb = single_exit (loop)->dest;
5585   exit_gsi = gsi_after_labels (exit_bb);
5586   reduc_inputs.create (slp_node ? vec_num : ncopies);
5587   for (unsigned i = 0; i < vec_num; i++)
5588     {
5589       gimple_seq stmts = NULL;
5590       if (slp_node)
5591         def = vect_get_slp_vect_def (slp_node, i);
5592       else
5593         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5594       for (j = 0; j < ncopies; j++)
5595         {
5596           tree new_def = copy_ssa_name (def);
5597           phi = create_phi_node (new_def, exit_bb);
5598           if (j)
5599             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5600           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5601           new_def = gimple_convert (&stmts, vectype, new_def);
5602           reduc_inputs.quick_push (new_def);
5603         }
5604       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5605     }
5606
5607   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5608          (i.e. when reduc_fn is not available) and in the final adjustment
5609          code (if needed).  Also get the original scalar reduction variable as
5610          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5611          represents a reduction pattern), the tree-code and scalar-def are
5612          taken from the original stmt that the pattern-stmt (STMT) replaces.
5613          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5614          are taken from STMT.  */
5615
5616   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5617   if (orig_stmt_info != stmt_info)
5618     {
5619       /* Reduction pattern  */
5620       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5621       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5622     }
5623
5624   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5625   scalar_type = TREE_TYPE (scalar_dest);
5626   scalar_results.truncate (0);
5627   scalar_results.reserve_exact (group_size);
5628   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5629   bitsize = TYPE_SIZE (scalar_type);
5630
5631   /* True if we should implement SLP_REDUC using native reduction operations
5632      instead of scalar operations.  */
5633   direct_slp_reduc = (reduc_fn != IFN_LAST
5634                       && slp_reduc
5635                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5636
5637   /* In case of reduction chain, e.g.,
5638      # a1 = phi <a3, a0>
5639      a2 = operation (a1)
5640      a3 = operation (a2),
5641
5642      we may end up with more than one vector result.  Here we reduce them
5643      to one vector.
5644
5645      The same is true if we couldn't use a single defuse cycle.  */
5646   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5647       || direct_slp_reduc
5648       || ncopies > 1)
5649     {
5650       gimple_seq stmts = NULL;
5651       tree single_input = reduc_inputs[0];
5652       for (k = 1; k < reduc_inputs.length (); k++)
5653         single_input = gimple_build (&stmts, code, vectype,
5654                                      single_input, reduc_inputs[k]);
5655       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5656
5657       reduc_inputs.truncate (0);
5658       reduc_inputs.safe_push (single_input);
5659     }
5660
5661   tree orig_reduc_input = reduc_inputs[0];
5662
5663   /* If this loop is an epilogue loop that can be skipped after the
5664      main loop, we can only share a reduction operation between the
5665      main loop and the epilogue if we put it at the target of the
5666      skip edge.
5667
5668      We can still reuse accumulators if this check fails.  Doing so has
5669      the minor(?) benefit of making the epilogue loop's scalar result
5670      independent of the main loop's scalar result.  */
5671   bool unify_with_main_loop_p = false;
5672   if (reduc_info->reused_accumulator
5673       && loop_vinfo->skip_this_loop_edge
5674       && single_succ_p (exit_bb)
5675       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5676     {
5677       unify_with_main_loop_p = true;
5678
5679       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5680       reduc_inputs[0] = make_ssa_name (vectype);
5681       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5682       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5683                    UNKNOWN_LOCATION);
5684       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5685                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5686       exit_gsi = gsi_after_labels (reduc_block);
5687     }
5688
5689   /* Shouldn't be used beyond this point.  */
5690   exit_bb = nullptr;
5691
5692   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5693       && reduc_fn != IFN_LAST)
5694     {
5695       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5696          various data values where the condition matched and another vector
5697          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5698          need to extract the last matching index (which will be the index with
5699          highest value) and use this to index into the data vector.
5700          For the case where there were no matches, the data vector will contain
5701          all default values and the index vector will be all zeros.  */
5702
5703       /* Get various versions of the type of the vector of indexes.  */
5704       tree index_vec_type = TREE_TYPE (induction_index);
5705       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5706       tree index_scalar_type = TREE_TYPE (index_vec_type);
5707       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5708
5709       /* Get an unsigned integer version of the type of the data vector.  */
5710       int scalar_precision
5711         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5712       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5713       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5714                                                 vectype);
5715
5716       /* First we need to create a vector (ZERO_VEC) of zeros and another
5717          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5718          can create using a MAX reduction and then expanding.
5719          In the case where the loop never made any matches, the max index will
5720          be zero.  */
5721
5722       /* Vector of {0, 0, 0,...}.  */
5723       tree zero_vec = build_zero_cst (vectype);
5724
5725       /* Find maximum value from the vector of found indexes.  */
5726       tree max_index = make_ssa_name (index_scalar_type);
5727       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5728                                                           1, induction_index);
5729       gimple_call_set_lhs (max_index_stmt, max_index);
5730       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5731
5732       /* Vector of {max_index, max_index, max_index,...}.  */
5733       tree max_index_vec = make_ssa_name (index_vec_type);
5734       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5735                                                       max_index);
5736       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5737                                                         max_index_vec_rhs);
5738       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5739
5740       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5741          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5742          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5743          otherwise.  Only one value should match, resulting in a vector
5744          (VEC_COND) with one data value and the rest zeros.
5745          In the case where the loop never made any matches, every index will
5746          match, resulting in a vector with all data values (which will all be
5747          the default value).  */
5748
5749       /* Compare the max index vector to the vector of found indexes to find
5750          the position of the max value.  */
5751       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5752       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5753                                                       induction_index,
5754                                                       max_index_vec);
5755       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5756
5757       /* Use the compare to choose either values from the data vector or
5758          zero.  */
5759       tree vec_cond = make_ssa_name (vectype);
5760       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5761                                                    vec_compare,
5762                                                    reduc_inputs[0],
5763                                                    zero_vec);
5764       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5765
5766       /* Finally we need to extract the data value from the vector (VEC_COND)
5767          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5768          reduction, but because this doesn't exist, we can use a MAX reduction
5769          instead.  The data value might be signed or a float so we need to cast
5770          it first.
5771          In the case where the loop never made any matches, the data values are
5772          all identical, and so will reduce down correctly.  */
5773
5774       /* Make the matched data values unsigned.  */
5775       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5776       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5777                                        vec_cond);
5778       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5779                                                         VIEW_CONVERT_EXPR,
5780                                                         vec_cond_cast_rhs);
5781       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5782
5783       /* Reduce down to a scalar value.  */
5784       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5785       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5786                                                            1, vec_cond_cast);
5787       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5788       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5789
5790       /* Convert the reduced value back to the result type and set as the
5791          result.  */
5792       gimple_seq stmts = NULL;
5793       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5794                                data_reduc);
5795       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5796       scalar_results.safe_push (new_temp);
5797     }
5798   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5799            && reduc_fn == IFN_LAST)
5800     {
5801       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5802          idx = 0;
5803          idx_val = induction_index[0];
5804          val = data_reduc[0];
5805          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5806            if (induction_index[i] > idx_val)
5807              val = data_reduc[i], idx_val = induction_index[i];
5808          return val;  */
5809
5810       tree data_eltype = TREE_TYPE (vectype);
5811       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5812       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5813       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5814       /* Enforced by vectorizable_reduction, which ensures we have target
5815          support before allowing a conditional reduction on variable-length
5816          vectors.  */
5817       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5818       tree idx_val = NULL_TREE, val = NULL_TREE;
5819       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5820         {
5821           tree old_idx_val = idx_val;
5822           tree old_val = val;
5823           idx_val = make_ssa_name (idx_eltype);
5824           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5825                                              build3 (BIT_FIELD_REF, idx_eltype,
5826                                                      induction_index,
5827                                                      bitsize_int (el_size),
5828                                                      bitsize_int (off)));
5829           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5830           val = make_ssa_name (data_eltype);
5831           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5832                                              build3 (BIT_FIELD_REF,
5833                                                      data_eltype,
5834                                                      reduc_inputs[0],
5835                                                      bitsize_int (el_size),
5836                                                      bitsize_int (off)));
5837           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5838           if (off != 0)
5839             {
5840               tree new_idx_val = idx_val;
5841               if (off != v_size - el_size)
5842                 {
5843                   new_idx_val = make_ssa_name (idx_eltype);
5844                   epilog_stmt = gimple_build_assign (new_idx_val,
5845                                                      MAX_EXPR, idx_val,
5846                                                      old_idx_val);
5847                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5848                 }
5849               tree cond = make_ssa_name (boolean_type_node);
5850               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5851                                                  idx_val, old_idx_val);
5852               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5853               tree new_val = make_ssa_name (data_eltype);
5854               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5855                                                  cond, val, old_val);
5856               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5857               idx_val = new_idx_val;
5858               val = new_val;
5859             }
5860         }
5861       /* Convert the reduced value back to the result type and set as the
5862          result.  */
5863       gimple_seq stmts = NULL;
5864       val = gimple_convert (&stmts, scalar_type, val);
5865       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5866       scalar_results.safe_push (val);
5867     }
5868
5869   /* 2.3 Create the reduction code, using one of the three schemes described
5870          above. In SLP we simply need to extract all the elements from the
5871          vector (without reducing them), so we use scalar shifts.  */
5872   else if (reduc_fn != IFN_LAST && !slp_reduc)
5873     {
5874       tree tmp;
5875       tree vec_elem_type;
5876
5877       /* Case 1:  Create:
5878          v_out2 = reduc_expr <v_out1>  */
5879
5880       if (dump_enabled_p ())
5881         dump_printf_loc (MSG_NOTE, vect_location,
5882                          "Reduce using direct vector reduction.\n");
5883
5884       gimple_seq stmts = NULL;
5885       vec_elem_type = TREE_TYPE (vectype);
5886       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5887                                vec_elem_type, reduc_inputs[0]);
5888       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5889       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5890
5891       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5892           && induc_val)
5893         {
5894           /* Earlier we set the initial value to be a vector if induc_val
5895              values.  Check the result and if it is induc_val then replace
5896              with the original initial value, unless induc_val is
5897              the same as initial_def already.  */
5898           tree zcompare = make_ssa_name (boolean_type_node);
5899           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5900                                              new_temp, induc_val);
5901           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5902           tree initial_def = reduc_info->reduc_initial_values[0];
5903           tmp = make_ssa_name (new_scalar_dest);
5904           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5905                                              initial_def, new_temp);
5906           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5907           new_temp = tmp;
5908         }
5909
5910       scalar_results.safe_push (new_temp);
5911     }
5912   else if (direct_slp_reduc)
5913     {
5914       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5915          with the elements for other SLP statements replaced with the
5916          neutral value.  We can then do a normal reduction on each vector.  */
5917
5918       /* Enforced by vectorizable_reduction.  */
5919       gcc_assert (reduc_inputs.length () == 1);
5920       gcc_assert (pow2p_hwi (group_size));
5921
5922       gimple_seq seq = NULL;
5923
5924       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5925          and the same element size as VECTYPE.  */
5926       tree index = build_index_vector (vectype, 0, 1);
5927       tree index_type = TREE_TYPE (index);
5928       tree index_elt_type = TREE_TYPE (index_type);
5929       tree mask_type = truth_type_for (index_type);
5930
5931       /* Create a vector that, for each element, identifies which of
5932          the REDUC_GROUP_SIZE results should use it.  */
5933       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5934       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5935                             build_vector_from_val (index_type, index_mask));
5936
5937       /* Get a neutral vector value.  This is simply a splat of the neutral
5938          scalar value if we have one, otherwise the initial scalar value
5939          is itself a neutral value.  */
5940       tree vector_identity = NULL_TREE;
5941       tree neutral_op = NULL_TREE;
5942       if (slp_node)
5943         {
5944           tree initial_value = NULL_TREE;
5945           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5946             initial_value = reduc_info->reduc_initial_values[0];
5947           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5948                                                  initial_value);
5949         }
5950       if (neutral_op)
5951         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5952                                                         neutral_op);
5953       for (unsigned int i = 0; i < group_size; ++i)
5954         {
5955           /* If there's no univeral neutral value, we can use the
5956              initial scalar value from the original PHI.  This is used
5957              for MIN and MAX reduction, for example.  */
5958           if (!neutral_op)
5959             {
5960               tree scalar_value = reduc_info->reduc_initial_values[i];
5961               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5962                                              scalar_value);
5963               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5964                                                               scalar_value);
5965             }
5966
5967           /* Calculate the equivalent of:
5968
5969              sel[j] = (index[j] == i);
5970
5971              which selects the elements of REDUC_INPUTS[0] that should
5972              be included in the result.  */
5973           tree compare_val = build_int_cst (index_elt_type, i);
5974           compare_val = build_vector_from_val (index_type, compare_val);
5975           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5976                                    index, compare_val);
5977
5978           /* Calculate the equivalent of:
5979
5980              vec = seq ? reduc_inputs[0] : vector_identity;
5981
5982              VEC is now suitable for a full vector reduction.  */
5983           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5984                                    sel, reduc_inputs[0], vector_identity);
5985
5986           /* Do the reduction and convert it to the appropriate type.  */
5987           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5988                                       TREE_TYPE (vectype), vec);
5989           scalar = gimple_convert (&seq, scalar_type, scalar);
5990           scalar_results.safe_push (scalar);
5991         }
5992       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5993     }
5994   else
5995     {
5996       bool reduce_with_shift;
5997       tree vec_temp;
5998
5999       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6000
6001       /* See if the target wants to do the final (shift) reduction
6002          in a vector mode of smaller size and first reduce upper/lower
6003          halves against each other.  */
6004       enum machine_mode mode1 = mode;
6005       tree stype = TREE_TYPE (vectype);
6006       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6007       unsigned nunits1 = nunits;
6008       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6009           && reduc_inputs.length () == 1)
6010         {
6011           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6012           /* For SLP reductions we have to make sure lanes match up, but
6013              since we're doing individual element final reduction reducing
6014              vector width here is even more important.
6015              ???  We can also separate lanes with permutes, for the common
6016              case of power-of-two group-size odd/even extracts would work.  */
6017           if (slp_reduc && nunits != nunits1)
6018             {
6019               nunits1 = least_common_multiple (nunits1, group_size);
6020               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6021             }
6022         }
6023       if (!slp_reduc
6024           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6025         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6026
6027       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6028                                                            stype, nunits1);
6029       reduce_with_shift = have_whole_vector_shift (mode1);
6030       if (!VECTOR_MODE_P (mode1)
6031           || !directly_supported_p (code, vectype1))
6032         reduce_with_shift = false;
6033
6034       /* First reduce the vector to the desired vector size we should
6035          do shift reduction on by combining upper and lower halves.  */
6036       gimple_seq stmts = NULL;
6037       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6038                                              code, &stmts);
6039       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6040       reduc_inputs[0] = new_temp;
6041
6042       if (reduce_with_shift && !slp_reduc)
6043         {
6044           int element_bitsize = tree_to_uhwi (bitsize);
6045           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6046              for variable-length vectors and also requires direct target support
6047              for loop reductions.  */
6048           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6049           int nelements = vec_size_in_bits / element_bitsize;
6050           vec_perm_builder sel;
6051           vec_perm_indices indices;
6052
6053           int elt_offset;
6054
6055           tree zero_vec = build_zero_cst (vectype1);
6056           /* Case 2: Create:
6057              for (offset = nelements/2; offset >= 1; offset/=2)
6058                 {
6059                   Create:  va' = vec_shift <va, offset>
6060                   Create:  va = vop <va, va'>
6061                 }  */
6062
6063           tree rhs;
6064
6065           if (dump_enabled_p ())
6066             dump_printf_loc (MSG_NOTE, vect_location,
6067                              "Reduce using vector shifts\n");
6068
6069           gimple_seq stmts = NULL;
6070           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6071           for (elt_offset = nelements / 2;
6072                elt_offset >= 1;
6073                elt_offset /= 2)
6074             {
6075               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6076               indices.new_vector (sel, 2, nelements);
6077               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6078               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6079                                        new_temp, zero_vec, mask);
6080               new_temp = gimple_build (&stmts, code,
6081                                        vectype1, new_name, new_temp);
6082             }
6083           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6084
6085           /* 2.4  Extract the final scalar result.  Create:
6086              s_out3 = extract_field <v_out2, bitpos>  */
6087
6088           if (dump_enabled_p ())
6089             dump_printf_loc (MSG_NOTE, vect_location,
6090                              "extract scalar result\n");
6091
6092           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6093                         bitsize, bitsize_zero_node);
6094           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6095           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6096           gimple_assign_set_lhs (epilog_stmt, new_temp);
6097           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6098           scalar_results.safe_push (new_temp);
6099         }
6100       else
6101         {
6102           /* Case 3: Create:
6103              s = extract_field <v_out2, 0>
6104              for (offset = element_size;
6105                   offset < vector_size;
6106                   offset += element_size;)
6107                {
6108                  Create:  s' = extract_field <v_out2, offset>
6109                  Create:  s = op <s, s'>  // For non SLP cases
6110                }  */
6111
6112           if (dump_enabled_p ())
6113             dump_printf_loc (MSG_NOTE, vect_location,
6114                              "Reduce using scalar code.\n");
6115
6116           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6117           int element_bitsize = tree_to_uhwi (bitsize);
6118           tree compute_type = TREE_TYPE (vectype);
6119           gimple_seq stmts = NULL;
6120           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6121             {
6122               int bit_offset;
6123               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6124                                        vec_temp, bitsize, bitsize_zero_node);
6125
6126               /* In SLP we don't need to apply reduction operation, so we just
6127                  collect s' values in SCALAR_RESULTS.  */
6128               if (slp_reduc)
6129                 scalar_results.safe_push (new_temp);
6130
6131               for (bit_offset = element_bitsize;
6132                    bit_offset < vec_size_in_bits;
6133                    bit_offset += element_bitsize)
6134                 {
6135                   tree bitpos = bitsize_int (bit_offset);
6136                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6137                                            compute_type, vec_temp,
6138                                            bitsize, bitpos);
6139                   if (slp_reduc)
6140                     {
6141                       /* In SLP we don't need to apply reduction operation, so
6142                          we just collect s' values in SCALAR_RESULTS.  */
6143                       new_temp = new_name;
6144                       scalar_results.safe_push (new_name);
6145                     }
6146                   else
6147                     new_temp = gimple_build (&stmts, code, compute_type,
6148                                              new_name, new_temp);
6149                 }
6150             }
6151
6152           /* The only case where we need to reduce scalar results in SLP, is
6153              unrolling.  If the size of SCALAR_RESULTS is greater than
6154              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6155              REDUC_GROUP_SIZE.  */
6156           if (slp_reduc)
6157             {
6158               tree res, first_res, new_res;
6159
6160               /* Reduce multiple scalar results in case of SLP unrolling.  */
6161               for (j = group_size; scalar_results.iterate (j, &res);
6162                    j++)
6163                 {
6164                   first_res = scalar_results[j % group_size];
6165                   new_res = gimple_build (&stmts, code, compute_type,
6166                                           first_res, res);
6167                   scalar_results[j % group_size] = new_res;
6168                 }
6169               scalar_results.truncate (group_size);
6170               for (k = 0; k < group_size; k++)
6171                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6172                                                     scalar_results[k]);
6173             }
6174           else
6175             {
6176               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6177               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6178               scalar_results.safe_push (new_temp);
6179             }
6180
6181           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6182         }
6183
6184       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6185           && induc_val)
6186         {
6187           /* Earlier we set the initial value to be a vector if induc_val
6188              values.  Check the result and if it is induc_val then replace
6189              with the original initial value, unless induc_val is
6190              the same as initial_def already.  */
6191           tree zcompare = make_ssa_name (boolean_type_node);
6192           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6193                                              induc_val);
6194           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6195           tree initial_def = reduc_info->reduc_initial_values[0];
6196           tree tmp = make_ssa_name (new_scalar_dest);
6197           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6198                                              initial_def, new_temp);
6199           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6200           scalar_results[0] = tmp;
6201         }
6202     }
6203
6204   /* 2.5 Adjust the final result by the initial value of the reduction
6205          variable. (When such adjustment is not needed, then
6206          'adjustment_def' is zero).  For example, if code is PLUS we create:
6207          new_temp = loop_exit_def + adjustment_def  */
6208
6209   if (adjustment_def)
6210     {
6211       gcc_assert (!slp_reduc);
6212       gimple_seq stmts = NULL;
6213       if (double_reduc)
6214         {
6215           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6216           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6217           new_temp = gimple_build (&stmts, code, vectype,
6218                                    reduc_inputs[0], adjustment_def);
6219         }
6220       else
6221         {
6222           new_temp = scalar_results[0];
6223           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6224           adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6225           new_temp = gimple_build (&stmts, code, scalar_type,
6226                                    new_temp, adjustment_def);
6227         }
6228
6229       epilog_stmt = gimple_seq_last_stmt (stmts);
6230       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6231       scalar_results[0] = new_temp;
6232     }
6233
6234   /* Record this operation if it could be reused by the epilogue loop.  */
6235   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
6236     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6237                                            { orig_reduc_input, reduc_info });
6238
6239   if (double_reduc)
6240     loop = outer_loop;
6241
6242   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6243           phis with new adjusted scalar results, i.e., replace use <s_out0>
6244           with use <s_out4>.
6245
6246      Transform:
6247         loop_exit:
6248           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6249           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6250           v_out2 = reduce <v_out1>
6251           s_out3 = extract_field <v_out2, 0>
6252           s_out4 = adjust_result <s_out3>
6253           use <s_out0>
6254           use <s_out0>
6255
6256      into:
6257
6258         loop_exit:
6259           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6260           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6261           v_out2 = reduce <v_out1>
6262           s_out3 = extract_field <v_out2, 0>
6263           s_out4 = adjust_result <s_out3>
6264           use <s_out4>
6265           use <s_out4> */
6266
6267   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6268   for (k = 0; k < live_out_stmts.size (); k++)
6269     {
6270       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6271       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6272
6273       phis.create (3);
6274       /* Find the loop-closed-use at the loop exit of the original scalar
6275          result.  (The reduction result is expected to have two immediate uses,
6276          one at the latch block, and one at the loop exit).  For double
6277          reductions we are looking for exit phis of the outer loop.  */
6278       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6279         {
6280           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6281             {
6282               if (!is_gimple_debug (USE_STMT (use_p)))
6283                 phis.safe_push (USE_STMT (use_p));
6284             }
6285           else
6286             {
6287               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6288                 {
6289                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6290
6291                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6292                     {
6293                       if (!flow_bb_inside_loop_p (loop,
6294                                              gimple_bb (USE_STMT (phi_use_p)))
6295                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6296                         phis.safe_push (USE_STMT (phi_use_p));
6297                     }
6298                 }
6299             }
6300         }
6301
6302       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6303         {
6304           /* Replace the uses:  */
6305           orig_name = PHI_RESULT (exit_phi);
6306
6307           /* Look for a single use at the target of the skip edge.  */
6308           if (unify_with_main_loop_p)
6309             {
6310               use_operand_p use_p;
6311               gimple *user;
6312               if (!single_imm_use (orig_name, &use_p, &user))
6313                 gcc_unreachable ();
6314               orig_name = gimple_get_lhs (user);
6315             }
6316
6317           scalar_result = scalar_results[k];
6318           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6319             {
6320               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6321                 SET_USE (use_p, scalar_result);
6322               update_stmt (use_stmt);
6323             }
6324         }
6325
6326       phis.release ();
6327     }
6328 }
6329
6330 /* Return a vector of type VECTYPE that is equal to the vector select
6331    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6332    before GSI.  */
6333
6334 static tree
6335 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6336                      tree vec, tree identity)
6337 {
6338   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6339   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6340                                           mask, vec, identity);
6341   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6342   return cond;
6343 }
6344
6345 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6346    order, starting with LHS.  Insert the extraction statements before GSI and
6347    associate the new scalar SSA names with variable SCALAR_DEST.
6348    Return the SSA name for the result.  */
6349
6350 static tree
6351 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6352                        tree_code code, tree lhs, tree vector_rhs)
6353 {
6354   tree vectype = TREE_TYPE (vector_rhs);
6355   tree scalar_type = TREE_TYPE (vectype);
6356   tree bitsize = TYPE_SIZE (scalar_type);
6357   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6358   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6359
6360   for (unsigned HOST_WIDE_INT bit_offset = 0;
6361        bit_offset < vec_size_in_bits;
6362        bit_offset += element_bitsize)
6363     {
6364       tree bitpos = bitsize_int (bit_offset);
6365       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6366                          bitsize, bitpos);
6367
6368       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6369       rhs = make_ssa_name (scalar_dest, stmt);
6370       gimple_assign_set_lhs (stmt, rhs);
6371       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6372
6373       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6374       tree new_name = make_ssa_name (scalar_dest, stmt);
6375       gimple_assign_set_lhs (stmt, new_name);
6376       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6377       lhs = new_name;
6378     }
6379   return lhs;
6380 }
6381
6382 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6383    type of the vector input.  */
6384
6385 static internal_fn
6386 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6387 {
6388   internal_fn mask_reduc_fn;
6389
6390   switch (reduc_fn)
6391     {
6392     case IFN_FOLD_LEFT_PLUS:
6393       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6394       break;
6395
6396     default:
6397       return IFN_LAST;
6398     }
6399
6400   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6401                                       OPTIMIZE_FOR_SPEED))
6402     return mask_reduc_fn;
6403   return IFN_LAST;
6404 }
6405
6406 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6407    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6408    statement.  CODE is the operation performed by STMT_INFO and OPS are
6409    its scalar operands.  REDUC_INDEX is the index of the operand in
6410    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6411    implements in-order reduction, or IFN_LAST if we should open-code it.
6412    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6413    that should be used to control the operation in a fully-masked loop.  */
6414
6415 static bool
6416 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6417                                stmt_vec_info stmt_info,
6418                                gimple_stmt_iterator *gsi,
6419                                gimple **vec_stmt, slp_tree slp_node,
6420                                gimple *reduc_def_stmt,
6421                                tree_code code, internal_fn reduc_fn,
6422                                tree ops[3], tree vectype_in,
6423                                int reduc_index, vec_loop_masks *masks)
6424 {
6425   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6426   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6427   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6428
6429   int ncopies;
6430   if (slp_node)
6431     ncopies = 1;
6432   else
6433     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6434
6435   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6436   gcc_assert (ncopies == 1);
6437   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6438
6439   if (slp_node)
6440     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6441                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6442
6443   tree op0 = ops[1 - reduc_index];
6444
6445   int group_size = 1;
6446   stmt_vec_info scalar_dest_def_info;
6447   auto_vec<tree> vec_oprnds0;
6448   if (slp_node)
6449     {
6450       auto_vec<vec<tree> > vec_defs (2);
6451       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6452       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6453       vec_defs[0].release ();
6454       vec_defs[1].release ();
6455       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6456       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6457     }
6458   else
6459     {
6460       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6461                                      op0, &vec_oprnds0);
6462       scalar_dest_def_info = stmt_info;
6463     }
6464
6465   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6466   tree scalar_type = TREE_TYPE (scalar_dest);
6467   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6468
6469   int vec_num = vec_oprnds0.length ();
6470   gcc_assert (vec_num == 1 || slp_node);
6471   tree vec_elem_type = TREE_TYPE (vectype_out);
6472   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6473
6474   tree vector_identity = NULL_TREE;
6475   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6476     vector_identity = build_zero_cst (vectype_out);
6477
6478   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6479   int i;
6480   tree def0;
6481   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6482     {
6483       gimple *new_stmt;
6484       tree mask = NULL_TREE;
6485       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6486         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6487
6488       /* Handle MINUS by adding the negative.  */
6489       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6490         {
6491           tree negated = make_ssa_name (vectype_out);
6492           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6493           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6494           def0 = negated;
6495         }
6496
6497       if (mask && mask_reduc_fn == IFN_LAST)
6498         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6499                                     vector_identity);
6500
6501       /* On the first iteration the input is simply the scalar phi
6502          result, and for subsequent iterations it is the output of
6503          the preceding operation.  */
6504       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6505         {
6506           if (mask && mask_reduc_fn != IFN_LAST)
6507             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6508                                                    def0, mask);
6509           else
6510             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6511                                                    def0);
6512           /* For chained SLP reductions the output of the previous reduction
6513              operation serves as the input of the next. For the final statement
6514              the output cannot be a temporary - we reuse the original
6515              scalar destination of the last statement.  */
6516           if (i != vec_num - 1)
6517             {
6518               gimple_set_lhs (new_stmt, scalar_dest_var);
6519               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6520               gimple_set_lhs (new_stmt, reduc_var);
6521             }
6522         }
6523       else
6524         {
6525           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6526                                              reduc_var, def0);
6527           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6528           /* Remove the statement, so that we can use the same code paths
6529              as for statements that we've just created.  */
6530           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6531           gsi_remove (&tmp_gsi, true);
6532         }
6533
6534       if (i == vec_num - 1)
6535         {
6536           gimple_set_lhs (new_stmt, scalar_dest);
6537           vect_finish_replace_stmt (loop_vinfo,
6538                                     scalar_dest_def_info,
6539                                     new_stmt);
6540         }
6541       else
6542         vect_finish_stmt_generation (loop_vinfo,
6543                                      scalar_dest_def_info,
6544                                      new_stmt, gsi);
6545
6546       if (slp_node)
6547         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6548       else
6549         {
6550           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6551           *vec_stmt = new_stmt;
6552         }
6553     }
6554
6555   return true;
6556 }
6557
6558 /* Function is_nonwrapping_integer_induction.
6559
6560    Check if STMT_VINO (which is part of loop LOOP) both increments and
6561    does not cause overflow.  */
6562
6563 static bool
6564 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6565 {
6566   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6567   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6568   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6569   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6570   widest_int ni, max_loop_value, lhs_max;
6571   wi::overflow_type overflow = wi::OVF_NONE;
6572
6573   /* Make sure the loop is integer based.  */
6574   if (TREE_CODE (base) != INTEGER_CST
6575       || TREE_CODE (step) != INTEGER_CST)
6576     return false;
6577
6578   /* Check that the max size of the loop will not wrap.  */
6579
6580   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6581     return true;
6582
6583   if (! max_stmt_executions (loop, &ni))
6584     return false;
6585
6586   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6587                             &overflow);
6588   if (overflow)
6589     return false;
6590
6591   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6592                             TYPE_SIGN (lhs_type), &overflow);
6593   if (overflow)
6594     return false;
6595
6596   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6597           <= TYPE_PRECISION (lhs_type));
6598 }
6599
6600 /* Check if masking can be supported by inserting a conditional expression.
6601    CODE is the code for the operation.  COND_FN is the conditional internal
6602    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6603 static bool
6604 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6605                          tree vectype_in)
6606 {
6607   if (cond_fn != IFN_LAST
6608       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6609                                          OPTIMIZE_FOR_SPEED))
6610     return false;
6611
6612   if (code.is_tree_code ())
6613     switch (tree_code (code))
6614       {
6615       case DOT_PROD_EXPR:
6616       case SAD_EXPR:
6617         return true;
6618
6619       default:
6620         break;
6621       }
6622   return false;
6623 }
6624
6625 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6626    code for the operation.  VOP is the array of operands.  MASK is the loop
6627    mask.  GSI is a statement iterator used to place the new conditional
6628    expression.  */
6629 static void
6630 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6631                       gimple_stmt_iterator *gsi)
6632 {
6633   switch (tree_code (code))
6634     {
6635     case DOT_PROD_EXPR:
6636       {
6637         tree vectype = TREE_TYPE (vop[1]);
6638         tree zero = build_zero_cst (vectype);
6639         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6640         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6641                                                mask, vop[1], zero);
6642         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6643         vop[1] = masked_op1;
6644         break;
6645       }
6646
6647     case SAD_EXPR:
6648       {
6649         tree vectype = TREE_TYPE (vop[1]);
6650         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6651         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6652                                                mask, vop[1], vop[0]);
6653         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6654         vop[1] = masked_op1;
6655         break;
6656       }
6657
6658     default:
6659       gcc_unreachable ();
6660     }
6661 }
6662
6663 /* Function vectorizable_reduction.
6664
6665    Check if STMT_INFO performs a reduction operation that can be vectorized.
6666    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6667    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6668    Return true if STMT_INFO is vectorizable in this way.
6669
6670    This function also handles reduction idioms (patterns) that have been
6671    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6672    may be of this form:
6673      X = pattern_expr (arg0, arg1, ..., X)
6674    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6675    sequence that had been detected and replaced by the pattern-stmt
6676    (STMT_INFO).
6677
6678    This function also handles reduction of condition expressions, for example:
6679      for (int i = 0; i < N; i++)
6680        if (a[i] < value)
6681          last = a[i];
6682    This is handled by vectorising the loop and creating an additional vector
6683    containing the loop indexes for which "a[i] < value" was true.  In the
6684    function epilogue this is reduced to a single max value and then used to
6685    index into the vector of results.
6686
6687    In some cases of reduction patterns, the type of the reduction variable X is
6688    different than the type of the other arguments of STMT_INFO.
6689    In such cases, the vectype that is used when transforming STMT_INFO into
6690    a vector stmt is different than the vectype that is used to determine the
6691    vectorization factor, because it consists of a different number of elements
6692    than the actual number of elements that are being operated upon in parallel.
6693
6694    For example, consider an accumulation of shorts into an int accumulator.
6695    On some targets it's possible to vectorize this pattern operating on 8
6696    shorts at a time (hence, the vectype for purposes of determining the
6697    vectorization factor should be V8HI); on the other hand, the vectype that
6698    is used to create the vector form is actually V4SI (the type of the result).
6699
6700    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6701    indicates what is the actual level of parallelism (V8HI in the example), so
6702    that the right vectorization factor would be derived.  This vectype
6703    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6704    be used to create the vectorized stmt.  The right vectype for the vectorized
6705    stmt is obtained from the type of the result X:
6706       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6707
6708    This means that, contrary to "regular" reductions (or "regular" stmts in
6709    general), the following equation:
6710       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6711    does *NOT* necessarily hold for reduction patterns.  */
6712
6713 bool
6714 vectorizable_reduction (loop_vec_info loop_vinfo,
6715                         stmt_vec_info stmt_info, slp_tree slp_node,
6716                         slp_instance slp_node_instance,
6717                         stmt_vector_for_cost *cost_vec)
6718 {
6719   tree vectype_in = NULL_TREE;
6720   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6721   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6722   stmt_vec_info cond_stmt_vinfo = NULL;
6723   int i;
6724   int ncopies;
6725   bool single_defuse_cycle = false;
6726   bool nested_cycle = false;
6727   bool double_reduc = false;
6728   int vec_num;
6729   tree tem;
6730   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6731   tree cond_reduc_val = NULL_TREE;
6732
6733   /* Make sure it was already recognized as a reduction computation.  */
6734   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6735       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6736       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6737     return false;
6738
6739   /* The stmt we store reduction analysis meta on.  */
6740   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6741   reduc_info->is_reduc_info = true;
6742
6743   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6744     {
6745       if (is_a <gphi *> (stmt_info->stmt))
6746         {
6747           if (slp_node)
6748             {
6749               /* We eventually need to set a vector type on invariant
6750                  arguments.  */
6751               unsigned j;
6752               slp_tree child;
6753               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6754                 if (!vect_maybe_update_slp_op_vectype
6755                        (child, SLP_TREE_VECTYPE (slp_node)))
6756                   {
6757                     if (dump_enabled_p ())
6758                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6759                                        "incompatible vector types for "
6760                                        "invariants\n");
6761                     return false;
6762                   }
6763             }
6764           /* Analysis for double-reduction is done on the outer
6765              loop PHI, nested cycles have no further restrictions.  */
6766           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6767         }
6768       else
6769         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6770       return true;
6771     }
6772
6773   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6774   stmt_vec_info phi_info = stmt_info;
6775   if (!is_a <gphi *> (stmt_info->stmt))
6776     {
6777       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6778       return true;
6779     }
6780   if (slp_node)
6781     {
6782       slp_node_instance->reduc_phis = slp_node;
6783       /* ???  We're leaving slp_node to point to the PHIs, we only
6784          need it to get at the number of vector stmts which wasn't
6785          yet initialized for the instance root.  */
6786     }
6787   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6788     {
6789       use_operand_p use_p;
6790       gimple *use_stmt;
6791       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6792                                  &use_p, &use_stmt);
6793       gcc_assert (res);
6794       phi_info = loop_vinfo->lookup_stmt (use_stmt);
6795     }
6796
6797   /* PHIs should not participate in patterns.  */
6798   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6799   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6800
6801   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6802      and compute the reduction chain length.  Discover the real
6803      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6804   tree reduc_def
6805     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6806                              loop_latch_edge
6807                                (gimple_bb (reduc_def_phi)->loop_father));
6808   unsigned reduc_chain_length = 0;
6809   bool only_slp_reduc_chain = true;
6810   stmt_info = NULL;
6811   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6812   while (reduc_def != PHI_RESULT (reduc_def_phi))
6813     {
6814       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6815       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6816       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6817         {
6818           if (dump_enabled_p ())
6819             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6820                              "reduction chain broken by patterns.\n");
6821           return false;
6822         }
6823       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6824         only_slp_reduc_chain = false;
6825       /* ???  For epilogue generation live members of the chain need
6826          to point back to the PHI via their original stmt for
6827          info_for_reduction to work.  */
6828       if (STMT_VINFO_LIVE_P (vdef))
6829         STMT_VINFO_REDUC_DEF (def) = phi_info;
6830       gimple_match_op op;
6831       if (!gimple_extract_op (vdef->stmt, &op))
6832         {
6833           if (dump_enabled_p ())
6834             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6835                              "reduction chain includes unsupported"
6836                              " statement type.\n");
6837           return false;
6838         }
6839       if (CONVERT_EXPR_CODE_P (op.code))
6840         {
6841           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6842             {
6843               if (dump_enabled_p ())
6844                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6845                                  "conversion in the reduction chain.\n");
6846               return false;
6847             }
6848         }
6849       else if (!stmt_info)
6850         /* First non-conversion stmt.  */
6851         stmt_info = vdef;
6852       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6853       reduc_chain_length++;
6854       if (!stmt_info && slp_node)
6855         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6856     }
6857   /* PHIs should not participate in patterns.  */
6858   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6859
6860   if (nested_in_vect_loop_p (loop, stmt_info))
6861     {
6862       loop = loop->inner;
6863       nested_cycle = true;
6864     }
6865
6866   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6867      element.  */
6868   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6869     {
6870       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6871       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6872     }
6873   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6874     gcc_assert (slp_node
6875                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6876
6877   /* 1. Is vectorizable reduction?  */
6878   /* Not supportable if the reduction variable is used in the loop, unless
6879      it's a reduction chain.  */
6880   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6881       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6882     return false;
6883
6884   /* Reductions that are not used even in an enclosing outer-loop,
6885      are expected to be "live" (used out of the loop).  */
6886   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6887       && !STMT_VINFO_LIVE_P (stmt_info))
6888     return false;
6889
6890   /* 2. Has this been recognized as a reduction pattern?
6891
6892      Check if STMT represents a pattern that has been recognized
6893      in earlier analysis stages.  For stmts that represent a pattern,
6894      the STMT_VINFO_RELATED_STMT field records the last stmt in
6895      the original sequence that constitutes the pattern.  */
6896
6897   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6898   if (orig_stmt_info)
6899     {
6900       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6901       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6902     }
6903
6904   /* 3. Check the operands of the operation.  The first operands are defined
6905         inside the loop body. The last operand is the reduction variable,
6906         which is defined by the loop-header-phi.  */
6907
6908   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6909   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6910   gimple_match_op op;
6911   if (!gimple_extract_op (stmt_info->stmt, &op))
6912     gcc_unreachable ();
6913   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6914                             || op.code == WIDEN_SUM_EXPR
6915                             || op.code == SAD_EXPR);
6916
6917   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6918       && !SCALAR_FLOAT_TYPE_P (op.type))
6919     return false;
6920
6921   /* Do not try to vectorize bit-precision reductions.  */
6922   if (!type_has_mode_precision_p (op.type))
6923     return false;
6924
6925   /* For lane-reducing ops we're reducing the number of reduction PHIs
6926      which means the only use of that may be in the lane-reducing operation.  */
6927   if (lane_reduc_code_p
6928       && reduc_chain_length != 1
6929       && !only_slp_reduc_chain)
6930     {
6931       if (dump_enabled_p ())
6932         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933                          "lane-reducing reduction with extra stmts.\n");
6934       return false;
6935     }
6936
6937   /* All uses but the last are expected to be defined in the loop.
6938      The last use is the reduction variable.  In case of nested cycle this
6939      assumption is not true: we use reduc_index to record the index of the
6940      reduction variable.  */
6941   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6942   /* We need to skip an extra operand for COND_EXPRs with embedded
6943      comparison.  */
6944   unsigned opno_adjust = 0;
6945   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6946     opno_adjust = 1;
6947   for (i = 0; i < (int) op.num_ops; i++)
6948     {
6949       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6950       if (i == 0 && op.code == COND_EXPR)
6951         continue;
6952
6953       stmt_vec_info def_stmt_info;
6954       enum vect_def_type dt;
6955       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6956                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6957                                &tem, &def_stmt_info))
6958         {
6959           if (dump_enabled_p ())
6960             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6961                              "use not simple.\n");
6962           return false;
6963         }
6964       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6965         continue;
6966
6967       /* There should be only one cycle def in the stmt, the one
6968          leading to reduc_def.  */
6969       if (VECTORIZABLE_CYCLE_DEF (dt))
6970         return false;
6971
6972       /* To properly compute ncopies we are interested in the widest
6973          non-reduction input type in case we're looking at a widening
6974          accumulation that we later handle in vect_transform_reduction.  */
6975       if (lane_reduc_code_p
6976           && tem
6977           && (!vectype_in
6978               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6979                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6980         vectype_in = tem;
6981
6982       if (op.code == COND_EXPR)
6983         {
6984           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6985           if (dt == vect_constant_def)
6986             {
6987               cond_reduc_dt = dt;
6988               cond_reduc_val = op.ops[i];
6989             }
6990           if (dt == vect_induction_def
6991               && def_stmt_info
6992               && is_nonwrapping_integer_induction (def_stmt_info, loop))
6993             {
6994               cond_reduc_dt = dt;
6995               cond_stmt_vinfo = def_stmt_info;
6996             }
6997         }
6998     }
6999   if (!vectype_in)
7000     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7001   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7002
7003   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7004   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7005   /* If we have a condition reduction, see if we can simplify it further.  */
7006   if (v_reduc_type == COND_REDUCTION)
7007     {
7008       if (slp_node)
7009         return false;
7010
7011       /* When the condition uses the reduction value in the condition, fail.  */
7012       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7013         {
7014           if (dump_enabled_p ())
7015             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7016                              "condition depends on previous iteration\n");
7017           return false;
7018         }
7019
7020       if (reduc_chain_length == 1
7021           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7022                                              vectype_in, OPTIMIZE_FOR_SPEED))
7023         {
7024           if (dump_enabled_p ())
7025             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7026                              "optimizing condition reduction with"
7027                              " FOLD_EXTRACT_LAST.\n");
7028           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7029         }
7030       else if (cond_reduc_dt == vect_induction_def)
7031         {
7032           tree base
7033             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7034           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7035
7036           gcc_assert (TREE_CODE (base) == INTEGER_CST
7037                       && TREE_CODE (step) == INTEGER_CST);
7038           cond_reduc_val = NULL_TREE;
7039           enum tree_code cond_reduc_op_code = ERROR_MARK;
7040           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7041           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7042             ;
7043           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7044              above base; punt if base is the minimum value of the type for
7045              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7046           else if (tree_int_cst_sgn (step) == -1)
7047             {
7048               cond_reduc_op_code = MIN_EXPR;
7049               if (tree_int_cst_sgn (base) == -1)
7050                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7051               else if (tree_int_cst_lt (base,
7052                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7053                 cond_reduc_val
7054                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7055             }
7056           else
7057             {
7058               cond_reduc_op_code = MAX_EXPR;
7059               if (tree_int_cst_sgn (base) == 1)
7060                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7061               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7062                                         base))
7063                 cond_reduc_val
7064                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7065             }
7066           if (cond_reduc_val)
7067             {
7068               if (dump_enabled_p ())
7069                 dump_printf_loc (MSG_NOTE, vect_location,
7070                                  "condition expression based on "
7071                                  "integer induction.\n");
7072               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7073               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7074                 = cond_reduc_val;
7075               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7076             }
7077         }
7078       else if (cond_reduc_dt == vect_constant_def)
7079         {
7080           enum vect_def_type cond_initial_dt;
7081           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7082           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7083           if (cond_initial_dt == vect_constant_def
7084               && types_compatible_p (TREE_TYPE (cond_initial_val),
7085                                      TREE_TYPE (cond_reduc_val)))
7086             {
7087               tree e = fold_binary (LE_EXPR, boolean_type_node,
7088                                     cond_initial_val, cond_reduc_val);
7089               if (e && (integer_onep (e) || integer_zerop (e)))
7090                 {
7091                   if (dump_enabled_p ())
7092                     dump_printf_loc (MSG_NOTE, vect_location,
7093                                      "condition expression based on "
7094                                      "compile time constant.\n");
7095                   /* Record reduction code at analysis stage.  */
7096                   STMT_VINFO_REDUC_CODE (reduc_info)
7097                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7098                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7099                 }
7100             }
7101         }
7102     }
7103
7104   if (STMT_VINFO_LIVE_P (phi_info))
7105     return false;
7106
7107   if (slp_node)
7108     ncopies = 1;
7109   else
7110     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7111
7112   gcc_assert (ncopies >= 1);
7113
7114   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7115
7116   if (nested_cycle)
7117     {
7118       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7119                   == vect_double_reduction_def);
7120       double_reduc = true;
7121     }
7122
7123   /* 4.2. Check support for the epilog operation.
7124
7125           If STMT represents a reduction pattern, then the type of the
7126           reduction variable may be different than the type of the rest
7127           of the arguments.  For example, consider the case of accumulation
7128           of shorts into an int accumulator; The original code:
7129                         S1: int_a = (int) short_a;
7130           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7131
7132           was replaced with:
7133                         STMT: int_acc = widen_sum <short_a, int_acc>
7134
7135           This means that:
7136           1. The tree-code that is used to create the vector operation in the
7137              epilog code (that reduces the partial results) is not the
7138              tree-code of STMT, but is rather the tree-code of the original
7139              stmt from the pattern that STMT is replacing.  I.e, in the example
7140              above we want to use 'widen_sum' in the loop, but 'plus' in the
7141              epilog.
7142           2. The type (mode) we use to check available target support
7143              for the vector operation to be created in the *epilog*, is
7144              determined by the type of the reduction variable (in the example
7145              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7146              However the type (mode) we use to check available target support
7147              for the vector operation to be created *inside the loop*, is
7148              determined by the type of the other arguments to STMT (in the
7149              example we'd check this: optab_handler (widen_sum_optab,
7150              vect_short_mode)).
7151
7152           This is contrary to "regular" reductions, in which the types of all
7153           the arguments are the same as the type of the reduction variable.
7154           For "regular" reductions we can therefore use the same vector type
7155           (and also the same tree-code) when generating the epilog code and
7156           when generating the code inside the loop.  */
7157
7158   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7159   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7160
7161   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7162   if (reduction_type == TREE_CODE_REDUCTION)
7163     {
7164       /* Check whether it's ok to change the order of the computation.
7165          Generally, when vectorizing a reduction we change the order of the
7166          computation.  This may change the behavior of the program in some
7167          cases, so we need to check that this is ok.  One exception is when
7168          vectorizing an outer-loop: the inner-loop is executed sequentially,
7169          and therefore vectorizing reductions in the inner-loop during
7170          outer-loop vectorization is safe.  Likewise when we are vectorizing
7171          a series of reductions using SLP and the VF is one the reductions
7172          are performed in scalar order.  */
7173       if (slp_node
7174           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7175           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7176         ;
7177       else if (needs_fold_left_reduction_p (op.type, orig_code))
7178         {
7179           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7180              is not directy used in stmt.  */
7181           if (!only_slp_reduc_chain
7182               && reduc_chain_length != 1)
7183             {
7184               if (dump_enabled_p ())
7185                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7186                                  "in-order reduction chain without SLP.\n");
7187               return false;
7188             }
7189           STMT_VINFO_REDUC_TYPE (reduc_info)
7190             = reduction_type = FOLD_LEFT_REDUCTION;
7191         }
7192       else if (!commutative_binary_op_p (orig_code, op.type)
7193                || !associative_binary_op_p (orig_code, op.type))
7194         {
7195           if (dump_enabled_p ())
7196             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7197                             "reduction: not commutative/associative");
7198           return false;
7199         }
7200     }
7201
7202   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7203       && ncopies > 1)
7204     {
7205       if (dump_enabled_p ())
7206         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7207                          "multiple types in double reduction or condition "
7208                          "reduction or fold-left reduction.\n");
7209       return false;
7210     }
7211
7212   internal_fn reduc_fn = IFN_LAST;
7213   if (reduction_type == TREE_CODE_REDUCTION
7214       || reduction_type == FOLD_LEFT_REDUCTION
7215       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7216       || reduction_type == CONST_COND_REDUCTION)
7217     {
7218       if (reduction_type == FOLD_LEFT_REDUCTION
7219           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7220           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7221         {
7222           if (reduc_fn != IFN_LAST
7223               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7224                                                   OPTIMIZE_FOR_SPEED))
7225             {
7226               if (dump_enabled_p ())
7227                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7228                                  "reduc op not supported by target.\n");
7229
7230               reduc_fn = IFN_LAST;
7231             }
7232         }
7233       else
7234         {
7235           if (!nested_cycle || double_reduc)
7236             {
7237               if (dump_enabled_p ())
7238                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7239                                  "no reduc code for scalar code.\n");
7240
7241               return false;
7242             }
7243         }
7244     }
7245   else if (reduction_type == COND_REDUCTION)
7246     {
7247       int scalar_precision
7248         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7249       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7250       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7251                                                 vectype_out);
7252
7253       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7254                                           OPTIMIZE_FOR_SPEED))
7255         reduc_fn = IFN_REDUC_MAX;
7256     }
7257   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7258
7259   if (reduction_type != EXTRACT_LAST_REDUCTION
7260       && (!nested_cycle || double_reduc)
7261       && reduc_fn == IFN_LAST
7262       && !nunits_out.is_constant ())
7263     {
7264       if (dump_enabled_p ())
7265         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7266                          "missing target support for reduction on"
7267                          " variable-length vectors.\n");
7268       return false;
7269     }
7270
7271   /* For SLP reductions, see if there is a neutral value we can use.  */
7272   tree neutral_op = NULL_TREE;
7273   if (slp_node)
7274     {
7275       tree initial_value = NULL_TREE;
7276       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7277         initial_value = vect_phi_initial_value (reduc_def_phi);
7278       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7279                                              orig_code, initial_value);
7280     }
7281
7282   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7283     {
7284       /* We can't support in-order reductions of code such as this:
7285
7286            for (int i = 0; i < n1; ++i)
7287              for (int j = 0; j < n2; ++j)
7288                l += a[j];
7289
7290          since GCC effectively transforms the loop when vectorizing:
7291
7292            for (int i = 0; i < n1 / VF; ++i)
7293              for (int j = 0; j < n2; ++j)
7294                for (int k = 0; k < VF; ++k)
7295                  l += a[j];
7296
7297          which is a reassociation of the original operation.  */
7298       if (dump_enabled_p ())
7299         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7300                          "in-order double reduction not supported.\n");
7301
7302       return false;
7303     }
7304
7305   if (reduction_type == FOLD_LEFT_REDUCTION
7306       && slp_node
7307       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7308     {
7309       /* We cannot use in-order reductions in this case because there is
7310          an implicit reassociation of the operations involved.  */
7311       if (dump_enabled_p ())
7312         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7313                          "in-order unchained SLP reductions not supported.\n");
7314       return false;
7315     }
7316
7317   /* For double reductions, and for SLP reductions with a neutral value,
7318      we construct a variable-length initial vector by loading a vector
7319      full of the neutral value and then shift-and-inserting the start
7320      values into the low-numbered elements.  */
7321   if ((double_reduc || neutral_op)
7322       && !nunits_out.is_constant ()
7323       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7324                                           vectype_out, OPTIMIZE_FOR_SPEED))
7325     {
7326       if (dump_enabled_p ())
7327         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7328                          "reduction on variable-length vectors requires"
7329                          " target support for a vector-shift-and-insert"
7330                          " operation.\n");
7331       return false;
7332     }
7333
7334   /* Check extra constraints for variable-length unchained SLP reductions.  */
7335   if (STMT_SLP_TYPE (stmt_info)
7336       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7337       && !nunits_out.is_constant ())
7338     {
7339       /* We checked above that we could build the initial vector when
7340          there's a neutral element value.  Check here for the case in
7341          which each SLP statement has its own initial value and in which
7342          that value needs to be repeated for every instance of the
7343          statement within the initial vector.  */
7344       unsigned int group_size = SLP_TREE_LANES (slp_node);
7345       if (!neutral_op
7346           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7347                                               TREE_TYPE (vectype_out)))
7348         {
7349           if (dump_enabled_p ())
7350             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7351                              "unsupported form of SLP reduction for"
7352                              " variable-length vectors: cannot build"
7353                              " initial vector.\n");
7354           return false;
7355         }
7356       /* The epilogue code relies on the number of elements being a multiple
7357          of the group size.  The duplicate-and-interleave approach to setting
7358          up the initial vector does too.  */
7359       if (!multiple_p (nunits_out, group_size))
7360         {
7361           if (dump_enabled_p ())
7362             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7363                              "unsupported form of SLP reduction for"
7364                              " variable-length vectors: the vector size"
7365                              " is not a multiple of the number of results.\n");
7366           return false;
7367         }
7368     }
7369
7370   if (reduction_type == COND_REDUCTION)
7371     {
7372       widest_int ni;
7373
7374       if (! max_loop_iterations (loop, &ni))
7375         {
7376           if (dump_enabled_p ())
7377             dump_printf_loc (MSG_NOTE, vect_location,
7378                              "loop count not known, cannot create cond "
7379                              "reduction.\n");
7380           return false;
7381         }
7382       /* Convert backedges to iterations.  */
7383       ni += 1;
7384
7385       /* The additional index will be the same type as the condition.  Check
7386          that the loop can fit into this less one (because we'll use up the
7387          zero slot for when there are no matches).  */
7388       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7389       if (wi::geu_p (ni, wi::to_widest (max_index)))
7390         {
7391           if (dump_enabled_p ())
7392             dump_printf_loc (MSG_NOTE, vect_location,
7393                              "loop size is greater than data size.\n");
7394           return false;
7395         }
7396     }
7397
7398   /* In case the vectorization factor (VF) is bigger than the number
7399      of elements that we can fit in a vectype (nunits), we have to generate
7400      more than one vector stmt - i.e - we need to "unroll" the
7401      vector stmt by a factor VF/nunits.  For more details see documentation
7402      in vectorizable_operation.  */
7403
7404   /* If the reduction is used in an outer loop we need to generate
7405      VF intermediate results, like so (e.g. for ncopies=2):
7406         r0 = phi (init, r0)
7407         r1 = phi (init, r1)
7408         r0 = x0 + r0;
7409         r1 = x1 + r1;
7410     (i.e. we generate VF results in 2 registers).
7411     In this case we have a separate def-use cycle for each copy, and therefore
7412     for each copy we get the vector def for the reduction variable from the
7413     respective phi node created for this copy.
7414
7415     Otherwise (the reduction is unused in the loop nest), we can combine
7416     together intermediate results, like so (e.g. for ncopies=2):
7417         r = phi (init, r)
7418         r = x0 + r;
7419         r = x1 + r;
7420    (i.e. we generate VF/2 results in a single register).
7421    In this case for each copy we get the vector def for the reduction variable
7422    from the vectorized reduction operation generated in the previous iteration.
7423
7424    This only works when we see both the reduction PHI and its only consumer
7425    in vectorizable_reduction and there are no intermediate stmts
7426    participating.  When unrolling we want each unrolled iteration to have its
7427    own reduction accumulator since one of the main goals of unrolling a
7428    reduction is to reduce the aggregate loop-carried latency.  */
7429   if (ncopies > 1
7430       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7431       && reduc_chain_length == 1
7432       && loop_vinfo->suggested_unroll_factor == 1)
7433     single_defuse_cycle = true;
7434
7435   if (single_defuse_cycle || lane_reduc_code_p)
7436     {
7437       gcc_assert (op.code != COND_EXPR);
7438
7439       /* 4. Supportable by target?  */
7440       bool ok = true;
7441
7442       /* 4.1. check support for the operation in the loop
7443
7444          This isn't necessary for the lane reduction codes, since they
7445          can only be produced by pattern matching, and it's up to the
7446          pattern matcher to test for support.  The main reason for
7447          specifically skipping this step is to avoid rechecking whether
7448          mixed-sign dot-products can be implemented using signed
7449          dot-products.  */
7450       machine_mode vec_mode = TYPE_MODE (vectype_in);
7451       if (!lane_reduc_code_p
7452           && !directly_supported_p (op.code, vectype_in, optab_vector))
7453         {
7454           if (dump_enabled_p ())
7455             dump_printf (MSG_NOTE, "op not supported by target.\n");
7456           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7457               || !vect_can_vectorize_without_simd_p (op.code))
7458             ok = false;
7459           else
7460             if (dump_enabled_p ())
7461               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7462         }
7463
7464       if (vect_emulated_vector_p (vectype_in)
7465           && !vect_can_vectorize_without_simd_p (op.code))
7466         {
7467           if (dump_enabled_p ())
7468             dump_printf (MSG_NOTE, "using word mode not possible.\n");
7469           return false;
7470         }
7471
7472       /* lane-reducing operations have to go through vect_transform_reduction.
7473          For the other cases try without the single cycle optimization.  */
7474       if (!ok)
7475         {
7476           if (lane_reduc_code_p)
7477             return false;
7478           else
7479             single_defuse_cycle = false;
7480         }
7481     }
7482   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7483
7484   /* If the reduction stmt is one of the patterns that have lane
7485      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7486   if ((ncopies > 1 && ! single_defuse_cycle)
7487       && lane_reduc_code_p)
7488     {
7489       if (dump_enabled_p ())
7490         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7491                          "multi def-use cycle not possible for lane-reducing "
7492                          "reduction operation\n");
7493       return false;
7494     }
7495
7496   if (slp_node
7497       && !(!single_defuse_cycle
7498            && !lane_reduc_code_p
7499            && reduction_type != FOLD_LEFT_REDUCTION))
7500     for (i = 0; i < (int) op.num_ops; i++)
7501       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7502         {
7503           if (dump_enabled_p ())
7504             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7505                              "incompatible vector types for invariants\n");
7506           return false;
7507         }
7508
7509   if (slp_node)
7510     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7511   else
7512     vec_num = 1;
7513
7514   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7515                              reduction_type, ncopies, cost_vec);
7516   /* Cost the reduction op inside the loop if transformed via
7517      vect_transform_reduction.  Otherwise this is costed by the
7518      separate vectorizable_* routines.  */
7519   if (single_defuse_cycle || lane_reduc_code_p)
7520     {
7521       int factor = 1;
7522       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7523         /* Three dot-products and a subtraction.  */
7524         factor = 4;
7525       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7526                         stmt_info, 0, vect_body);
7527     }
7528
7529   if (dump_enabled_p ()
7530       && reduction_type == FOLD_LEFT_REDUCTION)
7531     dump_printf_loc (MSG_NOTE, vect_location,
7532                      "using an in-order (fold-left) reduction.\n");
7533   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7534   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7535      reductions go through their own vectorizable_* routines.  */
7536   if (!single_defuse_cycle
7537       && !lane_reduc_code_p
7538       && reduction_type != FOLD_LEFT_REDUCTION)
7539     {
7540       stmt_vec_info tem
7541         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7542       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7543         {
7544           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7545           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7546         }
7547       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7548       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7549     }
7550   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7551     {
7552       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7553       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7554
7555       if (reduction_type != FOLD_LEFT_REDUCTION
7556           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7557           && (cond_fn == IFN_LAST
7558               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7559                                                   OPTIMIZE_FOR_SPEED)))
7560         {
7561           if (dump_enabled_p ())
7562             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7563                              "can't operate on partial vectors because"
7564                              " no conditional operation is available.\n");
7565           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7566         }
7567       else if (reduction_type == FOLD_LEFT_REDUCTION
7568                && reduc_fn == IFN_LAST
7569                && !expand_vec_cond_expr_p (vectype_in,
7570                                            truth_type_for (vectype_in),
7571                                            SSA_NAME))
7572         {
7573           if (dump_enabled_p ())
7574             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7575                              "can't operate on partial vectors because"
7576                              " no conditional operation is available.\n");
7577           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7578         }
7579       else
7580         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7581                                vectype_in, NULL);
7582     }
7583   return true;
7584 }
7585
7586 /* STMT_INFO is a dot-product reduction whose multiplication operands
7587    have different signs.  Emit a sequence to emulate the operation
7588    using a series of signed DOT_PROD_EXPRs and return the last
7589    statement generated.  VEC_DEST is the result of the vector operation
7590    and VOP lists its inputs.  */
7591
7592 static gassign *
7593 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7594                              gimple_stmt_iterator *gsi, tree vec_dest,
7595                              tree vop[3])
7596 {
7597   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7598   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7599   tree narrow_elttype = TREE_TYPE (narrow_vectype);
7600   gimple *new_stmt;
7601
7602   /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
7603   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7604     std::swap (vop[0], vop[1]);
7605
7606   /* Convert all inputs to signed types.  */
7607   for (int i = 0; i < 3; ++i)
7608     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7609       {
7610         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7611         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7612         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7613         vop[i] = tmp;
7614       }
7615
7616   /* In the comments below we assume 8-bit inputs for simplicity,
7617      but the approach works for any full integer type.  */
7618
7619   /* Create a vector of -128.  */
7620   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7621   tree min_narrow = build_vector_from_val (narrow_vectype,
7622                                            min_narrow_elttype);
7623
7624   /* Create a vector of 64.  */
7625   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7626   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7627   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7628
7629   /* Emit: SUB_RES = VOP[0] - 128.  */
7630   tree sub_res = make_ssa_name (narrow_vectype);
7631   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7632   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7633
7634   /* Emit:
7635
7636        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7637        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7638        STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7639
7640      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7641      Doing the two 64 * y steps first allows more time to compute x.  */
7642   tree stage1 = make_ssa_name (wide_vectype);
7643   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7644                                   vop[1], half_narrow, vop[2]);
7645   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7646
7647   tree stage2 = make_ssa_name (wide_vectype);
7648   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7649                                   vop[1], half_narrow, stage1);
7650   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7651
7652   tree stage3 = make_ssa_name (wide_vectype);
7653   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7654                                   sub_res, vop[1], stage2);
7655   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7656
7657   /* Convert STAGE3 to the reduction type.  */
7658   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7659 }
7660
7661 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7662    value.  */
7663
7664 bool
7665 vect_transform_reduction (loop_vec_info loop_vinfo,
7666                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7667                           gimple **vec_stmt, slp_tree slp_node)
7668 {
7669   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7670   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7671   int i;
7672   int ncopies;
7673   int vec_num;
7674
7675   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7676   gcc_assert (reduc_info->is_reduc_info);
7677
7678   if (nested_in_vect_loop_p (loop, stmt_info))
7679     {
7680       loop = loop->inner;
7681       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7682     }
7683
7684   gimple_match_op op;
7685   if (!gimple_extract_op (stmt_info->stmt, &op))
7686     gcc_unreachable ();
7687   gcc_assert (op.code.is_tree_code ());
7688   auto code = tree_code (op.code);
7689
7690   /* All uses but the last are expected to be defined in the loop.
7691      The last use is the reduction variable.  In case of nested cycle this
7692      assumption is not true: we use reduc_index to record the index of the
7693      reduction variable.  */
7694   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7695   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7696   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7697   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7698
7699   if (slp_node)
7700     {
7701       ncopies = 1;
7702       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7703     }
7704   else
7705     {
7706       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7707       vec_num = 1;
7708     }
7709
7710   internal_fn cond_fn = get_conditional_internal_fn (code);
7711   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7712   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7713
7714   /* Transform.  */
7715   tree new_temp = NULL_TREE;
7716   auto_vec<tree> vec_oprnds0;
7717   auto_vec<tree> vec_oprnds1;
7718   auto_vec<tree> vec_oprnds2;
7719   tree def0;
7720
7721   if (dump_enabled_p ())
7722     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7723
7724   /* FORNOW: Multiple types are not supported for condition.  */
7725   if (code == COND_EXPR)
7726     gcc_assert (ncopies == 1);
7727
7728   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7729
7730   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7731   if (reduction_type == FOLD_LEFT_REDUCTION)
7732     {
7733       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7734       return vectorize_fold_left_reduction
7735           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7736            reduc_fn, op.ops, vectype_in, reduc_index, masks);
7737     }
7738
7739   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7740   gcc_assert (single_defuse_cycle
7741               || code == DOT_PROD_EXPR
7742               || code == WIDEN_SUM_EXPR
7743               || code == SAD_EXPR);
7744
7745   /* Create the destination vector  */
7746   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
7747   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7748
7749   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7750                      single_defuse_cycle && reduc_index == 0
7751                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
7752                      single_defuse_cycle && reduc_index == 1
7753                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
7754                      op.num_ops == 3
7755                      && !(single_defuse_cycle && reduc_index == 2)
7756                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7757   if (single_defuse_cycle)
7758     {
7759       gcc_assert (!slp_node);
7760       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7761                                      op.ops[reduc_index],
7762                                      reduc_index == 0 ? &vec_oprnds0
7763                                      : (reduc_index == 1 ? &vec_oprnds1
7764                                         : &vec_oprnds2));
7765     }
7766
7767   bool emulated_mixed_dot_prod
7768     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
7769   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7770     {
7771       gimple *new_stmt;
7772       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7773       if (masked_loop_p && !mask_by_cond_expr)
7774         {
7775           /* No conditional ifns have been defined for dot-product yet.  */
7776           gcc_assert (code != DOT_PROD_EXPR);
7777
7778           /* Make sure that the reduction accumulator is vop[0].  */
7779           if (reduc_index == 1)
7780             {
7781               gcc_assert (commutative_tree_code (code));
7782               std::swap (vop[0], vop[1]);
7783             }
7784           tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7785                                           vectype_in, i);
7786           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7787                                                     vop[0], vop[1], vop[0]);
7788           new_temp = make_ssa_name (vec_dest, call);
7789           gimple_call_set_lhs (call, new_temp);
7790           gimple_call_set_nothrow (call, true);
7791           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7792           new_stmt = call;
7793         }
7794       else
7795         {
7796           if (op.num_ops == 3)
7797             vop[2] = vec_oprnds2[i];
7798
7799           if (masked_loop_p && mask_by_cond_expr)
7800             {
7801               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7802                                               vectype_in, i);
7803               build_vect_cond_expr (code, vop, mask, gsi);
7804             }
7805
7806           if (emulated_mixed_dot_prod)
7807             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
7808                                                     vec_dest, vop);
7809           else
7810             new_stmt = gimple_build_assign (vec_dest, code,
7811                                             vop[0], vop[1], vop[2]);
7812           new_temp = make_ssa_name (vec_dest, new_stmt);
7813           gimple_assign_set_lhs (new_stmt, new_temp);
7814           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7815         }
7816
7817       if (slp_node)
7818         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7819       else if (single_defuse_cycle
7820                && i < ncopies - 1)
7821         {
7822           if (reduc_index == 0)
7823             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7824           else if (reduc_index == 1)
7825             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7826           else if (reduc_index == 2)
7827             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7828         }
7829       else
7830         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7831     }
7832
7833   if (!slp_node)
7834     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7835
7836   return true;
7837 }
7838
7839 /* Transform phase of a cycle PHI.  */
7840
7841 bool
7842 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7843                           stmt_vec_info stmt_info, gimple **vec_stmt,
7844                           slp_tree slp_node, slp_instance slp_node_instance)
7845 {
7846   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7847   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7848   int i;
7849   int ncopies;
7850   int j;
7851   bool nested_cycle = false;
7852   int vec_num;
7853
7854   if (nested_in_vect_loop_p (loop, stmt_info))
7855     {
7856       loop = loop->inner;
7857       nested_cycle = true;
7858     }
7859
7860   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7861   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7862   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7863   gcc_assert (reduc_info->is_reduc_info);
7864
7865   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7866       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7867     /* Leave the scalar phi in place.  */
7868     return true;
7869
7870   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7871   /* For a nested cycle we do not fill the above.  */
7872   if (!vectype_in)
7873     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7874   gcc_assert (vectype_in);
7875
7876   if (slp_node)
7877     {
7878       /* The size vect_schedule_slp_instance computes is off for us.  */
7879       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7880                                       * SLP_TREE_LANES (slp_node), vectype_in);
7881       ncopies = 1;
7882     }
7883   else
7884     {
7885       vec_num = 1;
7886       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7887     }
7888
7889   /* Check whether we should use a single PHI node and accumulate
7890      vectors to one before the backedge.  */
7891   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7892     ncopies = 1;
7893
7894   /* Create the destination vector  */
7895   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7896   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7897                                                vectype_out);
7898
7899   /* Get the loop-entry arguments.  */
7900   tree vec_initial_def = NULL_TREE;
7901   auto_vec<tree> vec_initial_defs;
7902   if (slp_node)
7903     {
7904       vec_initial_defs.reserve (vec_num);
7905       if (nested_cycle)
7906         {
7907           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7908           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7909                              &vec_initial_defs);
7910         }
7911       else
7912         {
7913           gcc_assert (slp_node == slp_node_instance->reduc_phis);
7914           vec<tree> &initial_values = reduc_info->reduc_initial_values;
7915           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7916
7917           unsigned int num_phis = stmts.length ();
7918           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7919             num_phis = 1;
7920           initial_values.reserve (num_phis);
7921           for (unsigned int i = 0; i < num_phis; ++i)
7922             {
7923               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7924               initial_values.quick_push (vect_phi_initial_value (this_phi));
7925             }
7926           if (vec_num == 1)
7927             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7928           if (!initial_values.is_empty ())
7929             {
7930               tree initial_value
7931                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7932               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7933               tree neutral_op
7934                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7935                                             code, initial_value);
7936               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7937                                               &vec_initial_defs, vec_num,
7938                                               stmts.length (), neutral_op);
7939             }
7940         }
7941     }
7942   else
7943     {
7944       /* Get at the scalar def before the loop, that defines the initial
7945          value of the reduction variable.  */
7946       tree initial_def = vect_phi_initial_value (phi);
7947       reduc_info->reduc_initial_values.safe_push (initial_def);
7948       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7949          and we can't use zero for induc_val, use initial_def.  Similarly
7950          for REDUC_MIN and initial_def larger than the base.  */
7951       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7952         {
7953           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7954           if (TREE_CODE (initial_def) == INTEGER_CST
7955               && !integer_zerop (induc_val)
7956               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7957                    && tree_int_cst_lt (initial_def, induc_val))
7958                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7959                       && tree_int_cst_lt (induc_val, initial_def))))
7960             {
7961               induc_val = initial_def;
7962               /* Communicate we used the initial_def to epilouge
7963                  generation.  */
7964               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7965             }
7966           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7967         }
7968       else if (nested_cycle)
7969         {
7970           /* Do not use an adjustment def as that case is not supported
7971              correctly if ncopies is not one.  */
7972           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7973                                          ncopies, initial_def,
7974                                          &vec_initial_defs);
7975         }
7976       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7977                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7978         /* Fill the initial vector with the initial scalar value.  */
7979         vec_initial_def
7980           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7981                                            initial_def, initial_def);
7982       else
7983         {
7984           if (ncopies == 1)
7985             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7986           if (!reduc_info->reduc_initial_values.is_empty ())
7987             {
7988               initial_def = reduc_info->reduc_initial_values[0];
7989               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7990               tree neutral_op
7991                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7992                                             code, initial_def);
7993               gcc_assert (neutral_op);
7994               /* Try to simplify the vector initialization by applying an
7995                  adjustment after the reduction has been performed.  */
7996               if (!reduc_info->reused_accumulator
7997                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7998                   && !operand_equal_p (neutral_op, initial_def))
7999                 {
8000                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8001                     = initial_def;
8002                   initial_def = neutral_op;
8003                 }
8004               vec_initial_def
8005                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8006                                                  initial_def, neutral_op);
8007             }
8008         }
8009     }
8010
8011   if (vec_initial_def)
8012     {
8013       vec_initial_defs.create (ncopies);
8014       for (i = 0; i < ncopies; ++i)
8015         vec_initial_defs.quick_push (vec_initial_def);
8016     }
8017
8018   if (auto *accumulator = reduc_info->reused_accumulator)
8019     {
8020       tree def = accumulator->reduc_input;
8021       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8022         {
8023           unsigned int nreduc;
8024           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8025                                             (TREE_TYPE (def)),
8026                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8027                                           &nreduc);
8028           gcc_assert (res);
8029           gimple_seq stmts = NULL;
8030           /* Reduce the single vector to a smaller one.  */
8031           if (nreduc != 1)
8032             {
8033               /* Perform the reduction in the appropriate type.  */
8034               tree rvectype = vectype_out;
8035               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8036                                               TREE_TYPE (TREE_TYPE (def))))
8037                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8038                                               TYPE_VECTOR_SUBPARTS
8039                                                 (vectype_out));
8040               def = vect_create_partial_epilog (def, rvectype,
8041                                                 STMT_VINFO_REDUC_CODE
8042                                                   (reduc_info),
8043                                                 &stmts);
8044             }
8045           /* The epilogue loop might use a different vector mode, like
8046              VNx2DI vs. V2DI.  */
8047           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8048             {
8049               tree reduc_type = build_vector_type_for_mode
8050                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8051               def = gimple_convert (&stmts, reduc_type, def);
8052             }
8053           /* Adjust the input so we pick up the partially reduced value
8054              for the skip edge in vect_create_epilog_for_reduction.  */
8055           accumulator->reduc_input = def;
8056           /* And the reduction could be carried out using a different sign.  */
8057           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8058             def = gimple_convert (&stmts, vectype_out, def);
8059           if (loop_vinfo->main_loop_edge)
8060             {
8061               /* While we'd like to insert on the edge this will split
8062                  blocks and disturb bookkeeping, we also will eventually
8063                  need this on the skip edge.  Rely on sinking to
8064                  fixup optimal placement and insert in the pred.  */
8065               gimple_stmt_iterator gsi
8066                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8067               /* Insert before a cond that eventually skips the
8068                  epilogue.  */
8069               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8070                 gsi_prev (&gsi);
8071               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8072             }
8073           else
8074             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8075                                               stmts);
8076         }
8077       if (loop_vinfo->main_loop_edge)
8078         vec_initial_defs[0]
8079           = vect_get_main_loop_result (loop_vinfo, def,
8080                                        vec_initial_defs[0]);
8081       else
8082         vec_initial_defs.safe_push (def);
8083     }
8084
8085   /* Generate the reduction PHIs upfront.  */
8086   for (i = 0; i < vec_num; i++)
8087     {
8088       tree vec_init_def = vec_initial_defs[i];
8089       for (j = 0; j < ncopies; j++)
8090         {
8091           /* Create the reduction-phi that defines the reduction
8092              operand.  */
8093           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8094
8095           /* Set the loop-entry arg of the reduction-phi.  */
8096           if (j != 0 && nested_cycle)
8097             vec_init_def = vec_initial_defs[j];
8098           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8099                        UNKNOWN_LOCATION);
8100
8101           /* The loop-latch arg is set in epilogue processing.  */
8102
8103           if (slp_node)
8104             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8105           else
8106             {
8107               if (j == 0)
8108                 *vec_stmt = new_phi;
8109               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8110             }
8111         }
8112     }
8113
8114   return true;
8115 }
8116
8117 /* Vectorizes LC PHIs.  */
8118
8119 bool
8120 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8121                      stmt_vec_info stmt_info, gimple **vec_stmt,
8122                      slp_tree slp_node)
8123 {
8124   if (!loop_vinfo
8125       || !is_a <gphi *> (stmt_info->stmt)
8126       || gimple_phi_num_args (stmt_info->stmt) != 1)
8127     return false;
8128
8129   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8130       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8131     return false;
8132
8133   if (!vec_stmt) /* transformation not required.  */
8134     {
8135       /* Deal with copies from externs or constants that disguise as
8136          loop-closed PHI nodes (PR97886).  */
8137       if (slp_node
8138           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8139                                                 SLP_TREE_VECTYPE (slp_node)))
8140         {
8141           if (dump_enabled_p ())
8142             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8143                              "incompatible vector types for invariants\n");
8144           return false;
8145         }
8146       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8147       return true;
8148     }
8149
8150   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8151   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8152   basic_block bb = gimple_bb (stmt_info->stmt);
8153   edge e = single_pred_edge (bb);
8154   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8155   auto_vec<tree> vec_oprnds;
8156   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8157                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8158                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8159   for (unsigned i = 0; i < vec_oprnds.length (); i++)
8160     {
8161       /* Create the vectorized LC PHI node.  */
8162       gphi *new_phi = create_phi_node (vec_dest, bb);
8163       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8164       if (slp_node)
8165         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8166       else
8167         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8168     }
8169   if (!slp_node)
8170     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8171
8172   return true;
8173 }
8174
8175 /* Vectorizes PHIs.  */
8176
8177 bool
8178 vectorizable_phi (vec_info *,
8179                   stmt_vec_info stmt_info, gimple **vec_stmt,
8180                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8181 {
8182   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8183     return false;
8184
8185   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8186     return false;
8187
8188   tree vectype = SLP_TREE_VECTYPE (slp_node);
8189
8190   if (!vec_stmt) /* transformation not required.  */
8191     {
8192       slp_tree child;
8193       unsigned i;
8194       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8195         if (!child)
8196           {
8197             if (dump_enabled_p ())
8198               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8199                                "PHI node with unvectorized backedge def\n");
8200             return false;
8201           }
8202         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8203           {
8204             if (dump_enabled_p ())
8205               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8206                                "incompatible vector types for invariants\n");
8207             return false;
8208           }
8209         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8210                  && !useless_type_conversion_p (vectype,
8211                                                 SLP_TREE_VECTYPE (child)))
8212           {
8213             /* With bools we can have mask and non-mask precision vectors
8214                or different non-mask precisions.  while pattern recog is
8215                supposed to guarantee consistency here bugs in it can cause
8216                mismatches (PR103489 and PR103800 for example).
8217                Deal with them here instead of ICEing later.  */
8218             if (dump_enabled_p ())
8219               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8220                                "incompatible vector type setup from "
8221                                "bool pattern detection\n");
8222             return false;
8223           }
8224
8225       /* For single-argument PHIs assume coalescing which means zero cost
8226          for the scalar and the vector PHIs.  This avoids artificially
8227          favoring the vector path (but may pessimize it in some cases).  */
8228       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8229         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8230                           vector_stmt, stmt_info, vectype, 0, vect_body);
8231       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8232       return true;
8233     }
8234
8235   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8236   basic_block bb = gimple_bb (stmt_info->stmt);
8237   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8238   auto_vec<gphi *> new_phis;
8239   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8240     {
8241       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8242
8243       /* Skip not yet vectorized defs.  */
8244       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8245           && SLP_TREE_VEC_STMTS (child).is_empty ())
8246         continue;
8247
8248       auto_vec<tree> vec_oprnds;
8249       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8250       if (!new_phis.exists ())
8251         {
8252           new_phis.create (vec_oprnds.length ());
8253           for (unsigned j = 0; j < vec_oprnds.length (); j++)
8254             {
8255               /* Create the vectorized LC PHI node.  */
8256               new_phis.quick_push (create_phi_node (vec_dest, bb));
8257               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8258             }
8259         }
8260       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8261       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8262         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8263     }
8264   /* We should have at least one already vectorized child.  */
8265   gcc_assert (new_phis.exists ());
8266
8267   return true;
8268 }
8269
8270 /* Return true if VECTYPE represents a vector that requires lowering
8271    by the vector lowering pass.  */
8272
8273 bool
8274 vect_emulated_vector_p (tree vectype)
8275 {
8276   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8277           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8278               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8279 }
8280
8281 /* Return true if we can emulate CODE on an integer mode representation
8282    of a vector.  */
8283
8284 bool
8285 vect_can_vectorize_without_simd_p (tree_code code)
8286 {
8287   switch (code)
8288     {
8289     case PLUS_EXPR:
8290     case MINUS_EXPR:
8291     case NEGATE_EXPR:
8292     case BIT_AND_EXPR:
8293     case BIT_IOR_EXPR:
8294     case BIT_XOR_EXPR:
8295     case BIT_NOT_EXPR:
8296       return true;
8297
8298     default:
8299       return false;
8300     }
8301 }
8302
8303 /* Likewise, but taking a code_helper.  */
8304
8305 bool
8306 vect_can_vectorize_without_simd_p (code_helper code)
8307 {
8308   return (code.is_tree_code ()
8309           && vect_can_vectorize_without_simd_p (tree_code (code)));
8310 }
8311
8312 /* Create vector init for vectorized iv.  */
8313 static tree
8314 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8315                                tree step_expr, poly_uint64 nunits,
8316                                tree vectype,
8317                                enum vect_induction_op_type induction_type)
8318 {
8319   unsigned HOST_WIDE_INT const_nunits;
8320   tree vec_shift, vec_init, new_name;
8321   unsigned i;
8322   tree itype = TREE_TYPE (vectype);
8323
8324   /* iv_loop is the loop to be vectorized. Create:
8325      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
8326   new_name = gimple_convert (stmts, itype, init_expr);
8327   switch (induction_type)
8328     {
8329     case vect_step_op_shr:
8330     case vect_step_op_shl:
8331       /* Build the Initial value from shift_expr.  */
8332       vec_init = gimple_build_vector_from_val (stmts,
8333                                                vectype,
8334                                                new_name);
8335       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8336                                 build_zero_cst (itype), step_expr);
8337       vec_init = gimple_build (stmts,
8338                                (induction_type == vect_step_op_shr
8339                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
8340                                vectype, vec_init, vec_shift);
8341       break;
8342
8343     case vect_step_op_neg:
8344       {
8345         vec_init = gimple_build_vector_from_val (stmts,
8346                                                  vectype,
8347                                                  new_name);
8348         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8349                                      vectype, vec_init);
8350         /* The encoding has 2 interleaved stepped patterns.  */
8351         vec_perm_builder sel (nunits, 2, 3);
8352         sel.quick_grow (6);
8353         for (i = 0; i < 3; i++)
8354           {
8355             sel[2 * i] = i;
8356             sel[2 * i + 1] = i + nunits;
8357           }
8358         vec_perm_indices indices (sel, 2, nunits);
8359         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8360            fail when vec_init is const vector. In that situation vec_perm is not
8361            really needed.  */
8362         tree perm_mask_even
8363           = vect_gen_perm_mask_any (vectype, indices);
8364         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8365                                  vectype,
8366                                  vec_init, vec_neg,
8367                                  perm_mask_even);
8368       }
8369       break;
8370
8371     case vect_step_op_mul:
8372       {
8373         /* Use unsigned mult to avoid UD integer overflow.  */
8374         gcc_assert (nunits.is_constant (&const_nunits));
8375         tree utype = unsigned_type_for (itype);
8376         tree uvectype = build_vector_type (utype,
8377                                            TYPE_VECTOR_SUBPARTS (vectype));
8378         new_name = gimple_convert (stmts, utype, new_name);
8379         vec_init = gimple_build_vector_from_val (stmts,
8380                                                  uvectype,
8381                                                  new_name);
8382         tree_vector_builder elts (uvectype, const_nunits, 1);
8383         tree elt_step = build_one_cst (utype);
8384
8385         elts.quick_push (elt_step);
8386         for (i = 1; i < const_nunits; i++)
8387           {
8388             /* Create: new_name_i = new_name + step_expr.  */
8389             elt_step = gimple_build (stmts, MULT_EXPR,
8390                                      utype, elt_step, step_expr);
8391             elts.quick_push (elt_step);
8392           }
8393         /* Create a vector from [new_name_0, new_name_1, ...,
8394            new_name_nunits-1].  */
8395         tree vec_mul = gimple_build_vector (stmts, &elts);
8396         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8397                                  vec_init, vec_mul);
8398         vec_init = gimple_convert (stmts, vectype, vec_init);
8399       }
8400       break;
8401
8402     default:
8403       gcc_unreachable ();
8404     }
8405
8406   return vec_init;
8407 }
8408
8409 /* Peel init_expr by skip_niter for induction_type.  */
8410 tree
8411 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8412                              tree skip_niters, tree step_expr,
8413                              enum vect_induction_op_type induction_type)
8414 {
8415   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8416   tree type = TREE_TYPE (init_expr);
8417   unsigned prec = TYPE_PRECISION (type);
8418   switch (induction_type)
8419     {
8420     case vect_step_op_neg:
8421       if (TREE_INT_CST_LOW (skip_niters) % 2)
8422         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8423       /* else no change.  */
8424       break;
8425
8426     case vect_step_op_shr:
8427     case vect_step_op_shl:
8428       skip_niters = gimple_convert (stmts, type, skip_niters);
8429       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8430       /* When shift mount >= precision, need to avoid UD.
8431          In the original loop, there's no UD, and according to semantic,
8432          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
8433       if (!tree_fits_uhwi_p (step_expr)
8434           || tree_to_uhwi (step_expr) >= prec)
8435         {
8436           if (induction_type == vect_step_op_shl
8437               || TYPE_UNSIGNED (type))
8438             init_expr = build_zero_cst (type);
8439           else
8440             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8441                                       init_expr,
8442                                       wide_int_to_tree (type, prec - 1));
8443         }
8444       else
8445         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8446                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
8447                                   type, init_expr, step_expr);
8448       break;
8449
8450     case vect_step_op_mul:
8451       {
8452         tree utype = unsigned_type_for (type);
8453         init_expr = gimple_convert (stmts, utype, init_expr);
8454         unsigned skipn = TREE_INT_CST_LOW (skip_niters);
8455         wide_int begin = wi::to_wide (step_expr);
8456         for (unsigned i = 0; i != skipn - 1; i++)
8457           begin = wi::mul (begin, wi::to_wide (step_expr));
8458         tree mult_expr = wide_int_to_tree (utype, begin);
8459         init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
8460         init_expr = gimple_convert (stmts, type, init_expr);
8461       }
8462       break;
8463
8464     default:
8465       gcc_unreachable ();
8466     }
8467
8468   return init_expr;
8469 }
8470
8471 /* Create vector step for vectorized iv.  */
8472 static tree
8473 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8474                                poly_uint64 vf,
8475                                enum vect_induction_op_type induction_type)
8476 {
8477   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
8478   tree new_name = NULL;
8479   /* Step should be pow (step, vf) for mult induction.  */
8480   if (induction_type == vect_step_op_mul)
8481     {
8482       gcc_assert (vf.is_constant ());
8483       wide_int begin = wi::to_wide (step_expr);
8484
8485       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
8486         begin = wi::mul (begin, wi::to_wide (step_expr));
8487
8488       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
8489     }
8490   else if (induction_type == vect_step_op_neg)
8491     /* Do nothing.  */
8492     ;
8493   else
8494     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
8495                              expr, step_expr);
8496   return new_name;
8497 }
8498
8499 static tree
8500 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
8501                                    stmt_vec_info stmt_info,
8502                                    tree new_name, tree vectype,
8503                                    enum vect_induction_op_type induction_type)
8504 {
8505   /* No step is needed for neg induction.  */
8506   if (induction_type == vect_step_op_neg)
8507     return NULL;
8508
8509   tree t = unshare_expr (new_name);
8510   gcc_assert (CONSTANT_CLASS_P (new_name)
8511               || TREE_CODE (new_name) == SSA_NAME);
8512   tree new_vec = build_vector_from_val (vectype, t);
8513   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
8514                                     new_vec, vectype, NULL);
8515   return vec_step;
8516 }
8517
8518 /* Update vectorized iv with vect_step, induc_def is init.  */
8519 static tree
8520 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
8521                           tree induc_def, tree vec_step,
8522                           enum vect_induction_op_type induction_type)
8523 {
8524   tree vec_def = induc_def;
8525   switch (induction_type)
8526     {
8527     case vect_step_op_mul:
8528       {
8529         /* Use unsigned mult to avoid UD integer overflow.  */
8530         tree uvectype
8531           = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
8532                                TYPE_VECTOR_SUBPARTS (vectype));
8533         vec_def = gimple_convert (stmts, uvectype, vec_def);
8534         vec_step = gimple_convert (stmts, uvectype, vec_step);
8535         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
8536                                 vec_def, vec_step);
8537         vec_def = gimple_convert (stmts, vectype, vec_def);
8538       }
8539       break;
8540
8541     case vect_step_op_shr:
8542       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
8543                               vec_def, vec_step);
8544       break;
8545
8546     case vect_step_op_shl:
8547       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
8548                               vec_def, vec_step);
8549       break;
8550     case vect_step_op_neg:
8551       vec_def = induc_def;
8552       /* Do nothing.  */
8553       break;
8554     default:
8555       gcc_unreachable ();
8556     }
8557
8558   return vec_def;
8559
8560 }
8561
8562 /* Return true if vectorizer can peel for nonlinear iv.  */
8563 bool
8564 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
8565                               enum vect_induction_op_type induction_type)
8566 {
8567   tree niters_skip;
8568   /* Init_expr will be update by vect_update_ivs_after_vectorizer,
8569      if niters is unkown:
8570      For shift, when shift mount >= precision, there would be UD.
8571      For mult, don't known how to generate
8572      init_expr * pow (step, niters) for variable niters.
8573      For neg, it should be ok, since niters of vectorized main loop
8574      will always be multiple of 2.  */
8575   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8576       && induction_type != vect_step_op_neg)
8577     {
8578       if (dump_enabled_p ())
8579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8580                          "Peeling for epilogue is not supported"
8581                          " for nonlinear induction except neg"
8582                          " when iteration count is unknown.\n");
8583       return false;
8584     }
8585
8586   /* Also doens't support peel for neg when niter is variable.
8587      ??? generate something like niter_expr & 1 ? init_expr : -init_expr?  */
8588   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8589   if ((niters_skip != NULL_TREE
8590        && TREE_CODE (niters_skip) != INTEGER_CST)
8591       || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
8592           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
8593     {
8594       if (dump_enabled_p ())
8595         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8596                          "Peeling for alignement is not supported"
8597                          " for nonlinear induction when niters_skip"
8598                          " is not constant.\n");
8599       return false;
8600     }
8601
8602   return true;
8603 }
8604
8605 /* Function vectorizable_induction
8606
8607    Check if STMT_INFO performs an nonlinear induction computation that can be
8608    vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8609    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8610    basic block.
8611    Return true if STMT_INFO is vectorizable in this way.  */
8612
8613 static bool
8614 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
8615                                   stmt_vec_info stmt_info,
8616                                   gimple **vec_stmt, slp_tree slp_node,
8617                                   stmt_vector_for_cost *cost_vec)
8618 {
8619   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8620   unsigned ncopies;
8621   bool nested_in_vect_loop = false;
8622   class loop *iv_loop;
8623   tree vec_def;
8624   edge pe = loop_preheader_edge (loop);
8625   basic_block new_bb;
8626   tree vec_init, vec_step;
8627   tree new_name;
8628   gimple *new_stmt;
8629   gphi *induction_phi;
8630   tree induc_def, vec_dest;
8631   tree init_expr, step_expr;
8632   tree niters_skip;
8633   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8634   unsigned i;
8635   gimple_stmt_iterator si;
8636
8637   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8638
8639   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8640   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8641   enum vect_induction_op_type induction_type
8642     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8643
8644   gcc_assert (induction_type > vect_step_op_add);
8645
8646   if (slp_node)
8647     ncopies = 1;
8648   else
8649     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8650   gcc_assert (ncopies >= 1);
8651
8652   /* FORNOW. Only handle nonlinear induction in the same loop.  */
8653   if (nested_in_vect_loop_p (loop, stmt_info))
8654     {
8655       if (dump_enabled_p ())
8656         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8657                          "nonlinear induction in nested loop.\n");
8658       return false;
8659     }
8660
8661   iv_loop = loop;
8662   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8663
8664   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
8665      update for each iv and a permutation to generate wanted vector iv.  */
8666   if (slp_node)
8667     {
8668       if (dump_enabled_p ())
8669         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8670                          "SLP induction not supported for nonlinear"
8671                          " induction.\n");
8672       return false;
8673     }
8674
8675   if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, induction_type))
8676     return false;
8677
8678   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
8679     {
8680       if (dump_enabled_p ())
8681         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8682                          "floating point nonlinear induction vectorization"
8683                          " not supported.\n");
8684       return false;
8685     }
8686
8687   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8688   init_expr = vect_phi_initial_value (phi);
8689   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
8690               && TREE_CODE (step_expr) == INTEGER_CST);
8691   /* step_expr should be aligned with init_expr,
8692      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
8693   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
8694
8695   if (TREE_CODE (init_expr) == INTEGER_CST)
8696     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
8697   else
8698     gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
8699                                        TREE_TYPE (init_expr)));
8700
8701   switch (induction_type)
8702     {
8703     case vect_step_op_neg:
8704       if (TREE_CODE (init_expr) != INTEGER_CST
8705           && TREE_CODE (init_expr) != REAL_CST)
8706         {
8707           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
8708           if (!directly_supported_p (NEGATE_EXPR, vectype))
8709             return false;
8710
8711           /* The encoding has 2 interleaved stepped patterns.  */
8712           vec_perm_builder sel (nunits, 2, 3);
8713           machine_mode mode = TYPE_MODE (vectype);
8714           sel.quick_grow (6);
8715           for (i = 0; i < 3; i++)
8716             {
8717               sel[i * 2] = i;
8718               sel[i * 2 + 1] = i + nunits;
8719             }
8720           vec_perm_indices indices (sel, 2, nunits);
8721           if (!can_vec_perm_const_p (mode, mode, indices))
8722             return false;
8723         }
8724       break;
8725
8726     case vect_step_op_mul:
8727       {
8728         /* Check for backend support of MULT_EXPR.  */
8729         if (!directly_supported_p (MULT_EXPR, vectype))
8730           return false;
8731
8732         /* ?? How to construct vector step for variable number vector.
8733            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
8734         if (!vf.is_constant ())
8735           return false;
8736       }
8737       break;
8738
8739     case vect_step_op_shr:
8740       /* Check for backend support of RSHIFT_EXPR.  */
8741       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
8742         return false;
8743
8744       /* Don't shift more than type precision to avoid UD.  */
8745       if (!tree_fits_uhwi_p (step_expr)
8746           || maybe_ge (nunits * tree_to_uhwi (step_expr),
8747                        TYPE_PRECISION (TREE_TYPE (init_expr))))
8748         return false;
8749       break;
8750
8751     case vect_step_op_shl:
8752       /* Check for backend support of RSHIFT_EXPR.  */
8753       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
8754         return false;
8755
8756       /* Don't shift more than type precision to avoid UD.  */
8757       if (!tree_fits_uhwi_p (step_expr)
8758           || maybe_ge (nunits * tree_to_uhwi (step_expr),
8759                        TYPE_PRECISION (TREE_TYPE (init_expr))))
8760         return false;
8761
8762       break;
8763
8764     default:
8765       gcc_unreachable ();
8766     }
8767
8768   if (!vec_stmt) /* transformation not required.  */
8769     {
8770       unsigned inside_cost = 0, prologue_cost = 0;
8771       /* loop cost for vec_loop. Neg induction doesn't have any
8772          inside_cost.  */
8773       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8774                                       stmt_info, 0, vect_body);
8775
8776       /* loop cost for vec_loop. Neg induction doesn't have any
8777          inside_cost.  */
8778       if (induction_type == vect_step_op_neg)
8779         inside_cost = 0;
8780
8781       /* prologue cost for vec_init and vec_step.  */
8782       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8783                                         stmt_info, 0, vect_prologue);
8784
8785       if (dump_enabled_p ())
8786         dump_printf_loc (MSG_NOTE, vect_location,
8787                          "vect_model_induction_cost: inside_cost = %d, "
8788                          "prologue_cost = %d. \n", inside_cost,
8789                          prologue_cost);
8790
8791       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8792       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
8793       return true;
8794     }
8795
8796   /* Transform.  */
8797
8798   /* Compute a vector variable, initialized with the first VF values of
8799      the induction variable.  E.g., for an iv with IV_PHI='X' and
8800      evolution S, for a vector of 4 units, we want to compute:
8801      [X, X + S, X + 2*S, X + 3*S].  */
8802
8803   if (dump_enabled_p ())
8804     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8805
8806   pe = loop_preheader_edge (iv_loop);
8807   /* Find the first insertion point in the BB.  */
8808   basic_block bb = gimple_bb (phi);
8809   si = gsi_after_labels (bb);
8810
8811   gimple_seq stmts = NULL;
8812
8813   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8814   /* If we are using the loop mask to "peel" for alignment then we need
8815      to adjust the start value here.  */
8816   if (niters_skip != NULL_TREE)
8817     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
8818                                              step_expr, induction_type);
8819
8820   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
8821                                             step_expr, nunits, vectype,
8822                                             induction_type);
8823   if (stmts)
8824     {
8825       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8826       gcc_assert (!new_bb);
8827     }
8828
8829   stmts = NULL;
8830   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
8831                                             vf, induction_type);
8832   if (stmts)
8833     {
8834       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8835       gcc_assert (!new_bb);
8836     }
8837
8838   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
8839                                                 new_name, vectype,
8840                                                 induction_type);
8841   /* Create the following def-use cycle:
8842      loop prolog:
8843      vec_init = ...
8844      vec_step = ...
8845      loop:
8846      vec_iv = PHI <vec_init, vec_loop>
8847      ...
8848      STMT
8849      ...
8850      vec_loop = vec_iv + vec_step;  */
8851
8852   /* Create the induction-phi that defines the induction-operand.  */
8853   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8854   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8855   induc_def = PHI_RESULT (induction_phi);
8856
8857   /* Create the iv update inside the loop.  */
8858   stmts = NULL;
8859   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
8860                                       induc_def, vec_step,
8861                                       induction_type);
8862
8863   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8864   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8865
8866   /* Set the arguments of the phi node:  */
8867   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8868   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8869                UNKNOWN_LOCATION);
8870
8871   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8872   *vec_stmt = induction_phi;
8873
8874   /* In case that vectorization factor (VF) is bigger than the number
8875      of elements that we can fit in a vectype (nunits), we have to generate
8876      more than one vector stmt - i.e - we need to "unroll" the
8877      vector stmt by a factor VF/nunits.  For more details see documentation
8878      in vectorizable_operation.  */
8879
8880   if (ncopies > 1)
8881     {
8882       stmts = NULL;
8883       /* FORNOW. This restriction should be relaxed.  */
8884       gcc_assert (!nested_in_vect_loop);
8885
8886       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
8887                                                 nunits, induction_type);
8888
8889       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
8890                                                     new_name, vectype,
8891                                                     induction_type);
8892       vec_def = induc_def;
8893       for (i = 1; i < ncopies; i++)
8894         {
8895           /* vec_i = vec_prev + vec_step.  */
8896           stmts = NULL;
8897           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
8898                                               vec_def, vec_step,
8899                                               induction_type);
8900           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8901           new_stmt = SSA_NAME_DEF_STMT (vec_def);
8902           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8903         }
8904     }
8905
8906   if (dump_enabled_p ())
8907     dump_printf_loc (MSG_NOTE, vect_location,
8908                      "transform induction: created def-use cycle: %G%G",
8909                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
8910
8911   return true;
8912 }
8913
8914 /* Function vectorizable_induction
8915
8916    Check if STMT_INFO performs an induction computation that can be vectorized.
8917    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8918    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8919    Return true if STMT_INFO is vectorizable in this way.  */
8920
8921 bool
8922 vectorizable_induction (loop_vec_info loop_vinfo,
8923                         stmt_vec_info stmt_info,
8924                         gimple **vec_stmt, slp_tree slp_node,
8925                         stmt_vector_for_cost *cost_vec)
8926 {
8927   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8928   unsigned ncopies;
8929   bool nested_in_vect_loop = false;
8930   class loop *iv_loop;
8931   tree vec_def;
8932   edge pe = loop_preheader_edge (loop);
8933   basic_block new_bb;
8934   tree new_vec, vec_init, vec_step, t;
8935   tree new_name;
8936   gimple *new_stmt;
8937   gphi *induction_phi;
8938   tree induc_def, vec_dest;
8939   tree init_expr, step_expr;
8940   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8941   unsigned i;
8942   tree expr;
8943   gimple_stmt_iterator si;
8944   enum vect_induction_op_type induction_type
8945     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8946
8947   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8948   if (!phi)
8949     return false;
8950
8951   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8952     return false;
8953
8954   /* Make sure it was recognized as induction computation.  */
8955   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8956     return false;
8957
8958   /* Handle nonlinear induction in a separate place.  */
8959   if (induction_type != vect_step_op_add)
8960     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
8961                                              vec_stmt, slp_node, cost_vec);
8962
8963   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8964   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8965
8966   if (slp_node)
8967     ncopies = 1;
8968   else
8969     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8970   gcc_assert (ncopies >= 1);
8971
8972   /* FORNOW. These restrictions should be relaxed.  */
8973   if (nested_in_vect_loop_p (loop, stmt_info))
8974     {
8975       imm_use_iterator imm_iter;
8976       use_operand_p use_p;
8977       gimple *exit_phi;
8978       edge latch_e;
8979       tree loop_arg;
8980
8981       if (ncopies > 1)
8982         {
8983           if (dump_enabled_p ())
8984             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8985                              "multiple types in nested loop.\n");
8986           return false;
8987         }
8988
8989       exit_phi = NULL;
8990       latch_e = loop_latch_edge (loop->inner);
8991       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8992       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8993         {
8994           gimple *use_stmt = USE_STMT (use_p);
8995           if (is_gimple_debug (use_stmt))
8996             continue;
8997
8998           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8999             {
9000               exit_phi = use_stmt;
9001               break;
9002             }
9003         }
9004       if (exit_phi)
9005         {
9006           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9007           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9008                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9009             {
9010               if (dump_enabled_p ())
9011                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9012                                  "inner-loop induction only used outside "
9013                                  "of the outer vectorized loop.\n");
9014               return false;
9015             }
9016         }
9017
9018       nested_in_vect_loop = true;
9019       iv_loop = loop->inner;
9020     }
9021   else
9022     iv_loop = loop;
9023   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9024
9025   if (slp_node && !nunits.is_constant ())
9026     {
9027       /* The current SLP code creates the step value element-by-element.  */
9028       if (dump_enabled_p ())
9029         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9030                          "SLP induction not supported for variable-length"
9031                          " vectors.\n");
9032       return false;
9033     }
9034
9035   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9036     {
9037       if (dump_enabled_p ())
9038         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9039                          "floating point induction vectorization disabled\n");
9040       return false;
9041     }
9042
9043   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9044   gcc_assert (step_expr != NULL_TREE);
9045   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9046
9047   /* Check for backend support of PLUS/MINUS_EXPR. */
9048   if (!directly_supported_p (PLUS_EXPR, step_vectype)
9049       || !directly_supported_p (MINUS_EXPR, step_vectype))
9050     return false;
9051
9052   if (!vec_stmt) /* transformation not required.  */
9053     {
9054       unsigned inside_cost = 0, prologue_cost = 0;
9055       if (slp_node)
9056         {
9057           /* We eventually need to set a vector type on invariant
9058              arguments.  */
9059           unsigned j;
9060           slp_tree child;
9061           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9062             if (!vect_maybe_update_slp_op_vectype
9063                 (child, SLP_TREE_VECTYPE (slp_node)))
9064               {
9065                 if (dump_enabled_p ())
9066                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9067                                    "incompatible vector types for "
9068                                    "invariants\n");
9069                 return false;
9070               }
9071           /* loop cost for vec_loop.  */
9072           inside_cost
9073             = record_stmt_cost (cost_vec,
9074                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9075                                 vector_stmt, stmt_info, 0, vect_body);
9076           /* prologue cost for vec_init (if not nested) and step.  */
9077           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9078                                             scalar_to_vec,
9079                                             stmt_info, 0, vect_prologue);
9080         }
9081       else /* if (!slp_node) */
9082         {
9083           /* loop cost for vec_loop.  */
9084           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9085                                           stmt_info, 0, vect_body);
9086           /* prologue cost for vec_init and vec_step.  */
9087           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9088                                             stmt_info, 0, vect_prologue);
9089         }
9090       if (dump_enabled_p ())
9091         dump_printf_loc (MSG_NOTE, vect_location,
9092                          "vect_model_induction_cost: inside_cost = %d, "
9093                          "prologue_cost = %d .\n", inside_cost,
9094                          prologue_cost);
9095
9096       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9097       DUMP_VECT_SCOPE ("vectorizable_induction");
9098       return true;
9099     }
9100
9101   /* Transform.  */
9102
9103   /* Compute a vector variable, initialized with the first VF values of
9104      the induction variable.  E.g., for an iv with IV_PHI='X' and
9105      evolution S, for a vector of 4 units, we want to compute:
9106      [X, X + S, X + 2*S, X + 3*S].  */
9107
9108   if (dump_enabled_p ())
9109     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9110
9111   pe = loop_preheader_edge (iv_loop);
9112   /* Find the first insertion point in the BB.  */
9113   basic_block bb = gimple_bb (phi);
9114   si = gsi_after_labels (bb);
9115
9116   /* For SLP induction we have to generate several IVs as for example
9117      with group size 3 we need
9118        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9119        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
9120   if (slp_node)
9121     {
9122       /* Enforced above.  */
9123       unsigned int const_nunits = nunits.to_constant ();
9124
9125       /* The initial values are vectorized, but any lanes > group_size
9126          need adjustment.  */
9127       slp_tree init_node
9128         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9129
9130       /* Gather steps.  Since we do not vectorize inductions as
9131          cycles we have to reconstruct the step from SCEV data.  */
9132       unsigned group_size = SLP_TREE_LANES (slp_node);
9133       tree *steps = XALLOCAVEC (tree, group_size);
9134       tree *inits = XALLOCAVEC (tree, group_size);
9135       stmt_vec_info phi_info;
9136       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9137         {
9138           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9139           if (!init_node)
9140             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9141                                            pe->dest_idx);
9142         }
9143
9144       /* Now generate the IVs.  */
9145       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9146       gcc_assert ((const_nunits * nvects) % group_size == 0);
9147       unsigned nivs;
9148       if (nested_in_vect_loop)
9149         nivs = nvects;
9150       else
9151         {
9152           /* Compute the number of distinct IVs we need.  First reduce
9153              group_size if it is a multiple of const_nunits so we get
9154              one IV for a group_size of 4 but const_nunits 2.  */
9155           unsigned group_sizep = group_size;
9156           if (group_sizep % const_nunits == 0)
9157             group_sizep = group_sizep / const_nunits;
9158           nivs = least_common_multiple (group_sizep,
9159                                         const_nunits) / const_nunits;
9160         }
9161       tree stept = TREE_TYPE (step_vectype);
9162       tree lupdate_mul = NULL_TREE;
9163       if (!nested_in_vect_loop)
9164         {
9165           /* The number of iterations covered in one vector iteration.  */
9166           unsigned lup_mul = (nvects * const_nunits) / group_size;
9167           lupdate_mul
9168             = build_vector_from_val (step_vectype,
9169                                      SCALAR_FLOAT_TYPE_P (stept)
9170                                      ? build_real_from_wide (stept, lup_mul,
9171                                                              UNSIGNED)
9172                                      : build_int_cstu (stept, lup_mul));
9173         }
9174       tree peel_mul = NULL_TREE;
9175       gimple_seq init_stmts = NULL;
9176       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9177         {
9178           if (SCALAR_FLOAT_TYPE_P (stept))
9179             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9180                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9181           else
9182             peel_mul = gimple_convert (&init_stmts, stept,
9183                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9184           peel_mul = gimple_build_vector_from_val (&init_stmts,
9185                                                    step_vectype, peel_mul);
9186         }
9187       unsigned ivn;
9188       auto_vec<tree> vec_steps;
9189       for (ivn = 0; ivn < nivs; ++ivn)
9190         {
9191           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9192           tree_vector_builder init_elts (vectype, const_nunits, 1);
9193           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9194           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9195             {
9196               /* The scalar steps of the IVs.  */
9197               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9198               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9199               step_elts.quick_push (elt);
9200               if (!init_node)
9201                 {
9202                   /* The scalar inits of the IVs if not vectorized.  */
9203                   elt = inits[(ivn*const_nunits + eltn) % group_size];
9204                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
9205                                                   TREE_TYPE (elt)))
9206                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9207                                         TREE_TYPE (vectype), elt);
9208                   init_elts.quick_push (elt);
9209                 }
9210               /* The number of steps to add to the initial values.  */
9211               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9212               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9213                                    ? build_real_from_wide (stept,
9214                                                            mul_elt, UNSIGNED)
9215                                    : build_int_cstu (stept, mul_elt));
9216             }
9217           vec_step = gimple_build_vector (&init_stmts, &step_elts);
9218           vec_steps.safe_push (vec_step);
9219           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9220           if (peel_mul)
9221             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9222                                      step_mul, peel_mul);
9223           if (!init_node)
9224             vec_init = gimple_build_vector (&init_stmts, &init_elts);
9225
9226           /* Create the induction-phi that defines the induction-operand.  */
9227           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9228                                             "vec_iv_");
9229           induction_phi = create_phi_node (vec_dest, iv_loop->header);
9230           induc_def = PHI_RESULT (induction_phi);
9231
9232           /* Create the iv update inside the loop  */
9233           tree up = vec_step;
9234           if (lupdate_mul)
9235             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9236                                vec_step, lupdate_mul);
9237           gimple_seq stmts = NULL;
9238           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9239           vec_def = gimple_build (&stmts,
9240                                   PLUS_EXPR, step_vectype, vec_def, up);
9241           vec_def = gimple_convert (&stmts, vectype, vec_def);
9242           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9243           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9244                        UNKNOWN_LOCATION);
9245
9246           if (init_node)
9247             vec_init = vect_get_slp_vect_def (init_node, ivn);
9248           if (!nested_in_vect_loop
9249               && !integer_zerop (step_mul))
9250             {
9251               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9252               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9253                                  vec_step, step_mul);
9254               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9255                                       vec_def, up);
9256               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9257             }
9258
9259           /* Set the arguments of the phi node:  */
9260           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9261
9262           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9263         }
9264       if (!nested_in_vect_loop)
9265         {
9266           /* Fill up to the number of vectors we need for the whole group.  */
9267           nivs = least_common_multiple (group_size,
9268                                         const_nunits) / const_nunits;
9269           vec_steps.reserve (nivs-ivn);
9270           for (; ivn < nivs; ++ivn)
9271             {
9272               SLP_TREE_VEC_STMTS (slp_node)
9273                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9274               vec_steps.quick_push (vec_steps[0]);
9275             }
9276         }
9277
9278       /* Re-use IVs when we can.  We are generating further vector
9279          stmts by adding VF' * stride to the IVs generated above.  */
9280       if (ivn < nvects)
9281         {
9282           unsigned vfp
9283             = least_common_multiple (group_size, const_nunits) / group_size;
9284           tree lupdate_mul
9285             = build_vector_from_val (step_vectype,
9286                                      SCALAR_FLOAT_TYPE_P (stept)
9287                                      ? build_real_from_wide (stept,
9288                                                              vfp, UNSIGNED)
9289                                      : build_int_cstu (stept, vfp));
9290           for (; ivn < nvects; ++ivn)
9291             {
9292               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9293               tree def = gimple_get_lhs (iv);
9294               if (ivn < 2*nivs)
9295                 vec_steps[ivn - nivs]
9296                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9297                                   vec_steps[ivn - nivs], lupdate_mul);
9298               gimple_seq stmts = NULL;
9299               def = gimple_convert (&stmts, step_vectype, def);
9300               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9301                                   def, vec_steps[ivn % nivs]);
9302               def = gimple_convert (&stmts, vectype, def);
9303               if (gimple_code (iv) == GIMPLE_PHI)
9304                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9305               else
9306                 {
9307                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9308                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9309                 }
9310               SLP_TREE_VEC_STMTS (slp_node)
9311                 .quick_push (SSA_NAME_DEF_STMT (def));
9312             }
9313         }
9314
9315       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9316       gcc_assert (!new_bb);
9317
9318       return true;
9319     }
9320
9321   init_expr = vect_phi_initial_value (phi);
9322
9323   gimple_seq stmts = NULL;
9324   if (!nested_in_vect_loop)
9325     {
9326       /* Convert the initial value to the IV update type.  */
9327       tree new_type = TREE_TYPE (step_expr);
9328       init_expr = gimple_convert (&stmts, new_type, init_expr);
9329
9330       /* If we are using the loop mask to "peel" for alignment then we need
9331          to adjust the start value here.  */
9332       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9333       if (skip_niters != NULL_TREE)
9334         {
9335           if (FLOAT_TYPE_P (vectype))
9336             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9337                                         skip_niters);
9338           else
9339             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9340           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9341                                          skip_niters, step_expr);
9342           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9343                                     init_expr, skip_step);
9344         }
9345     }
9346
9347   if (stmts)
9348     {
9349       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9350       gcc_assert (!new_bb);
9351     }
9352
9353   /* Create the vector that holds the initial_value of the induction.  */
9354   if (nested_in_vect_loop)
9355     {
9356       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
9357          been created during vectorization of previous stmts.  We obtain it
9358          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
9359       auto_vec<tree> vec_inits;
9360       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9361                                      init_expr, &vec_inits);
9362       vec_init = vec_inits[0];
9363       /* If the initial value is not of proper type, convert it.  */
9364       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9365         {
9366           new_stmt
9367             = gimple_build_assign (vect_get_new_ssa_name (vectype,
9368                                                           vect_simple_var,
9369                                                           "vec_iv_"),
9370                                    VIEW_CONVERT_EXPR,
9371                                    build1 (VIEW_CONVERT_EXPR, vectype,
9372                                            vec_init));
9373           vec_init = gimple_assign_lhs (new_stmt);
9374           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9375                                                  new_stmt);
9376           gcc_assert (!new_bb);
9377         }
9378     }
9379   else
9380     {
9381       /* iv_loop is the loop to be vectorized. Create:
9382          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
9383       stmts = NULL;
9384       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9385
9386       unsigned HOST_WIDE_INT const_nunits;
9387       if (nunits.is_constant (&const_nunits))
9388         {
9389           tree_vector_builder elts (step_vectype, const_nunits, 1);
9390           elts.quick_push (new_name);
9391           for (i = 1; i < const_nunits; i++)
9392             {
9393               /* Create: new_name_i = new_name + step_expr  */
9394               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9395                                        new_name, step_expr);
9396               elts.quick_push (new_name);
9397             }
9398           /* Create a vector from [new_name_0, new_name_1, ...,
9399              new_name_nunits-1]  */
9400           vec_init = gimple_build_vector (&stmts, &elts);
9401         }
9402       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9403         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
9404         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9405                                  new_name, step_expr);
9406       else
9407         {
9408           /* Build:
9409                 [base, base, base, ...]
9410                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
9411           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9412           gcc_assert (flag_associative_math);
9413           tree index = build_index_vector (step_vectype, 0, 1);
9414           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9415                                                         new_name);
9416           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9417                                                         step_expr);
9418           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9419           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9420                                    vec_init, step_vec);
9421           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9422                                    vec_init, base_vec);
9423         }
9424       vec_init = gimple_convert (&stmts, vectype, vec_init);
9425
9426       if (stmts)
9427         {
9428           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9429           gcc_assert (!new_bb);
9430         }
9431     }
9432
9433
9434   /* Create the vector that holds the step of the induction.  */
9435   if (nested_in_vect_loop)
9436     /* iv_loop is nested in the loop to be vectorized. Generate:
9437        vec_step = [S, S, S, S]  */
9438     new_name = step_expr;
9439   else
9440     {
9441       /* iv_loop is the loop to be vectorized. Generate:
9442           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
9443       gimple_seq seq = NULL;
9444       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9445         {
9446           expr = build_int_cst (integer_type_node, vf);
9447           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9448         }
9449       else
9450         expr = build_int_cst (TREE_TYPE (step_expr), vf);
9451       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9452                                expr, step_expr);
9453       if (seq)
9454         {
9455           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9456           gcc_assert (!new_bb);
9457         }
9458     }
9459
9460   t = unshare_expr (new_name);
9461   gcc_assert (CONSTANT_CLASS_P (new_name)
9462               || TREE_CODE (new_name) == SSA_NAME);
9463   new_vec = build_vector_from_val (step_vectype, t);
9464   vec_step = vect_init_vector (loop_vinfo, stmt_info,
9465                                new_vec, step_vectype, NULL);
9466
9467
9468   /* Create the following def-use cycle:
9469      loop prolog:
9470          vec_init = ...
9471          vec_step = ...
9472      loop:
9473          vec_iv = PHI <vec_init, vec_loop>
9474          ...
9475          STMT
9476          ...
9477          vec_loop = vec_iv + vec_step;  */
9478
9479   /* Create the induction-phi that defines the induction-operand.  */
9480   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9481   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9482   induc_def = PHI_RESULT (induction_phi);
9483
9484   /* Create the iv update inside the loop  */
9485   stmts = NULL;
9486   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9487   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
9488   vec_def = gimple_convert (&stmts, vectype, vec_def);
9489   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9490   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9491
9492   /* Set the arguments of the phi node:  */
9493   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9494   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9495                UNKNOWN_LOCATION);
9496
9497   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9498   *vec_stmt = induction_phi;
9499
9500   /* In case that vectorization factor (VF) is bigger than the number
9501      of elements that we can fit in a vectype (nunits), we have to generate
9502      more than one vector stmt - i.e - we need to "unroll" the
9503      vector stmt by a factor VF/nunits.  For more details see documentation
9504      in vectorizable_operation.  */
9505
9506   if (ncopies > 1)
9507     {
9508       gimple_seq seq = NULL;
9509       /* FORNOW. This restriction should be relaxed.  */
9510       gcc_assert (!nested_in_vect_loop);
9511
9512       /* Create the vector that holds the step of the induction.  */
9513       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9514         {
9515           expr = build_int_cst (integer_type_node, nunits);
9516           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9517         }
9518       else
9519         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
9520       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9521                                expr, step_expr);
9522       if (seq)
9523         {
9524           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9525           gcc_assert (!new_bb);
9526         }
9527
9528       t = unshare_expr (new_name);
9529       gcc_assert (CONSTANT_CLASS_P (new_name)
9530                   || TREE_CODE (new_name) == SSA_NAME);
9531       new_vec = build_vector_from_val (step_vectype, t);
9532       vec_step = vect_init_vector (loop_vinfo, stmt_info,
9533                                    new_vec, step_vectype, NULL);
9534
9535       vec_def = induc_def;
9536       for (i = 1; i < ncopies; i++)
9537         {
9538           /* vec_i = vec_prev + vec_step  */
9539           gimple_seq stmts = NULL;
9540           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
9541           vec_def = gimple_build (&stmts,
9542                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
9543           vec_def = gimple_convert (&stmts, vectype, vec_def);
9544
9545           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9546           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9547           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9548         }
9549     }
9550
9551   if (dump_enabled_p ())
9552     dump_printf_loc (MSG_NOTE, vect_location,
9553                      "transform induction: created def-use cycle: %G%G",
9554                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9555
9556   return true;
9557 }
9558
9559 /* Function vectorizable_live_operation.
9560
9561    STMT_INFO computes a value that is used outside the loop.  Check if
9562    it can be supported.  */
9563
9564 bool
9565 vectorizable_live_operation (vec_info *vinfo,
9566                              stmt_vec_info stmt_info,
9567                              gimple_stmt_iterator *gsi,
9568                              slp_tree slp_node, slp_instance slp_node_instance,
9569                              int slp_index, bool vec_stmt_p,
9570                              stmt_vector_for_cost *cost_vec)
9571 {
9572   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9573   imm_use_iterator imm_iter;
9574   tree lhs, lhs_type, bitsize;
9575   tree vectype = (slp_node
9576                   ? SLP_TREE_VECTYPE (slp_node)
9577                   : STMT_VINFO_VECTYPE (stmt_info));
9578   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9579   int ncopies;
9580   gimple *use_stmt;
9581   auto_vec<tree> vec_oprnds;
9582   int vec_entry = 0;
9583   poly_uint64 vec_index = 0;
9584
9585   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
9586
9587   /* If a stmt of a reduction is live, vectorize it via
9588      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
9589      validity so just trigger the transform here.  */
9590   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
9591     {
9592       if (!vec_stmt_p)
9593         return true;
9594       if (slp_node)
9595         {
9596           /* For reduction chains the meta-info is attached to
9597              the group leader.  */
9598           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
9599             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
9600           /* For SLP reductions we vectorize the epilogue for
9601              all involved stmts together.  */
9602           else if (slp_index != 0)
9603             return true;
9604           else
9605             /* For SLP reductions the meta-info is attached to
9606                the representative.  */
9607             stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
9608         }
9609       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
9610       gcc_assert (reduc_info->is_reduc_info);
9611       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
9612           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
9613         return true;
9614       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
9615                                         slp_node_instance);
9616       return true;
9617     }
9618
9619   /* If STMT is not relevant and it is a simple assignment and its inputs are
9620      invariant then it can remain in place, unvectorized.  The original last
9621      scalar value that it computes will be used.  */
9622   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9623     {
9624       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
9625       if (dump_enabled_p ())
9626         dump_printf_loc (MSG_NOTE, vect_location,
9627                          "statement is simple and uses invariant.  Leaving in "
9628                          "place.\n");
9629       return true;
9630     }
9631
9632   if (slp_node)
9633     ncopies = 1;
9634   else
9635     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9636
9637   if (slp_node)
9638     {
9639       gcc_assert (slp_index >= 0);
9640
9641       /* Get the last occurrence of the scalar index from the concatenation of
9642          all the slp vectors. Calculate which slp vector it is and the index
9643          within.  */
9644       int num_scalar = SLP_TREE_LANES (slp_node);
9645       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9646       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
9647
9648       /* Calculate which vector contains the result, and which lane of
9649          that vector we need.  */
9650       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
9651         {
9652           if (dump_enabled_p ())
9653             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9654                              "Cannot determine which vector holds the"
9655                              " final result.\n");
9656           return false;
9657         }
9658     }
9659
9660   if (!vec_stmt_p)
9661     {
9662       /* No transformation required.  */
9663       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
9664         {
9665           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
9666                                                OPTIMIZE_FOR_SPEED))
9667             {
9668               if (dump_enabled_p ())
9669                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670                                  "can't operate on partial vectors "
9671                                  "because the target doesn't support extract "
9672                                  "last reduction.\n");
9673               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9674             }
9675           else if (slp_node)
9676             {
9677               if (dump_enabled_p ())
9678                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9679                                  "can't operate on partial vectors "
9680                                  "because an SLP statement is live after "
9681                                  "the loop.\n");
9682               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9683             }
9684           else if (ncopies > 1)
9685             {
9686               if (dump_enabled_p ())
9687                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9688                                  "can't operate on partial vectors "
9689                                  "because ncopies is greater than 1.\n");
9690               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9691             }
9692           else
9693             {
9694               gcc_assert (ncopies == 1 && !slp_node);
9695               vect_record_loop_mask (loop_vinfo,
9696                                      &LOOP_VINFO_MASKS (loop_vinfo),
9697                                      1, vectype, NULL);
9698             }
9699         }
9700       /* ???  Enable for loop costing as well.  */
9701       if (!loop_vinfo)
9702         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
9703                           0, vect_epilogue);
9704       return true;
9705     }
9706
9707   /* Use the lhs of the original scalar statement.  */
9708   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
9709   if (dump_enabled_p ())
9710     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
9711                      "stmt %G", stmt);
9712
9713   lhs = gimple_get_lhs (stmt);
9714   lhs_type = TREE_TYPE (lhs);
9715
9716   bitsize = vector_element_bits_tree (vectype);
9717
9718   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
9719   tree vec_lhs, bitstart;
9720   gimple *vec_stmt;
9721   if (slp_node)
9722     {
9723       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
9724
9725       /* Get the correct slp vectorized stmt.  */
9726       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
9727       vec_lhs = gimple_get_lhs (vec_stmt);
9728
9729       /* Get entry to use.  */
9730       bitstart = bitsize_int (vec_index);
9731       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
9732     }
9733   else
9734     {
9735       /* For multiple copies, get the last copy.  */
9736       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
9737       vec_lhs = gimple_get_lhs (vec_stmt);
9738
9739       /* Get the last lane in the vector.  */
9740       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
9741     }
9742
9743   if (loop_vinfo)
9744     {
9745       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
9746          requirement, insert one phi node for it.  It looks like:
9747            loop;
9748          BB:
9749            # lhs' = PHI <lhs>
9750          ==>
9751            loop;
9752          BB:
9753            # vec_lhs' = PHI <vec_lhs>
9754            new_tree = lane_extract <vec_lhs', ...>;
9755            lhs' = new_tree;  */
9756
9757       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9758       basic_block exit_bb = single_exit (loop)->dest;
9759       gcc_assert (single_pred_p (exit_bb));
9760
9761       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
9762       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
9763       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
9764
9765       gimple_seq stmts = NULL;
9766       tree new_tree;
9767       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9768         {
9769           /* Emit:
9770
9771                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
9772
9773              where VEC_LHS is the vectorized live-out result and MASK is
9774              the loop mask for the final iteration.  */
9775           gcc_assert (ncopies == 1 && !slp_node);
9776           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
9777           tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
9778                                           1, vectype, 0);
9779           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
9780                                           mask, vec_lhs_phi);
9781
9782           /* Convert the extracted vector element to the scalar type.  */
9783           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
9784         }
9785       else
9786         {
9787           tree bftype = TREE_TYPE (vectype);
9788           if (VECTOR_BOOLEAN_TYPE_P (vectype))
9789             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9790           new_tree = build3 (BIT_FIELD_REF, bftype,
9791                              vec_lhs_phi, bitsize, bitstart);
9792           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9793                                            &stmts, true, NULL_TREE);
9794         }
9795
9796       if (stmts)
9797         {
9798           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
9799           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
9800
9801           /* Remove existing phi from lhs and create one copy from new_tree.  */
9802           tree lhs_phi = NULL_TREE;
9803           gimple_stmt_iterator gsi;
9804           for (gsi = gsi_start_phis (exit_bb);
9805                !gsi_end_p (gsi); gsi_next (&gsi))
9806             {
9807               gimple *phi = gsi_stmt (gsi);
9808               if ((gimple_phi_arg_def (phi, 0) == lhs))
9809                 {
9810                   remove_phi_node (&gsi, false);
9811                   lhs_phi = gimple_phi_result (phi);
9812                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
9813                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
9814                   break;
9815                 }
9816             }
9817         }
9818
9819       /* Replace use of lhs with newly computed result.  If the use stmt is a
9820          single arg PHI, just replace all uses of PHI result.  It's necessary
9821          because lcssa PHI defining lhs may be before newly inserted stmt.  */
9822       use_operand_p use_p;
9823       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9824         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
9825             && !is_gimple_debug (use_stmt))
9826           {
9827             if (gimple_code (use_stmt) == GIMPLE_PHI
9828                 && gimple_phi_num_args (use_stmt) == 1)
9829               {
9830                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
9831               }
9832             else
9833               {
9834                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9835                     SET_USE (use_p, new_tree);
9836               }
9837             update_stmt (use_stmt);
9838           }
9839     }
9840   else
9841     {
9842       /* For basic-block vectorization simply insert the lane-extraction.  */
9843       tree bftype = TREE_TYPE (vectype);
9844       if (VECTOR_BOOLEAN_TYPE_P (vectype))
9845         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9846       tree new_tree = build3 (BIT_FIELD_REF, bftype,
9847                               vec_lhs, bitsize, bitstart);
9848       gimple_seq stmts = NULL;
9849       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9850                                        &stmts, true, NULL_TREE);
9851       if (TREE_CODE (new_tree) == SSA_NAME
9852           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
9853         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
9854       if (is_a <gphi *> (vec_stmt))
9855         {
9856           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
9857           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9858         }
9859       else
9860         {
9861           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
9862           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
9863         }
9864
9865       /* Replace use of lhs with newly computed result.  If the use stmt is a
9866          single arg PHI, just replace all uses of PHI result.  It's necessary
9867          because lcssa PHI defining lhs may be before newly inserted stmt.  */
9868       use_operand_p use_p;
9869       stmt_vec_info use_stmt_info;
9870       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9871         if (!is_gimple_debug (use_stmt)
9872             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
9873                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
9874           {
9875             /* ???  This can happen when the live lane ends up being
9876                used in a vector construction code-generated by an
9877                external SLP node (and code-generation for that already
9878                happened).  See gcc.dg/vect/bb-slp-47.c.
9879                Doing this is what would happen if that vector CTOR
9880                were not code-generated yet so it is not too bad.
9881                ???  In fact we'd likely want to avoid this situation
9882                in the first place.  */
9883             if (TREE_CODE (new_tree) == SSA_NAME
9884                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9885                 && gimple_code (use_stmt) != GIMPLE_PHI
9886                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
9887                                                 use_stmt))
9888               {
9889                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
9890                 gcc_assert (code == CONSTRUCTOR
9891                             || code == VIEW_CONVERT_EXPR
9892                             || CONVERT_EXPR_CODE_P (code));
9893                 if (dump_enabled_p ())
9894                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9895                                    "Using original scalar computation for "
9896                                    "live lane because use preceeds vector "
9897                                    "def\n");
9898                 continue;
9899               }
9900             /* ???  It can also happen that we end up pulling a def into
9901                a loop where replacing out-of-loop uses would require
9902                a new LC SSA PHI node.  Retain the original scalar in
9903                those cases as well.  PR98064.  */
9904             if (TREE_CODE (new_tree) == SSA_NAME
9905                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9906                 && (gimple_bb (use_stmt)->loop_father
9907                     != gimple_bb (vec_stmt)->loop_father)
9908                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
9909                                         gimple_bb (use_stmt)->loop_father))
9910               {
9911                 if (dump_enabled_p ())
9912                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9913                                    "Using original scalar computation for "
9914                                    "live lane because there is an out-of-loop "
9915                                    "definition for it\n");
9916                 continue;
9917               }
9918             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9919               SET_USE (use_p, new_tree);
9920             update_stmt (use_stmt);
9921           }
9922     }
9923
9924   return true;
9925 }
9926
9927 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
9928
9929 static void
9930 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9931 {
9932   ssa_op_iter op_iter;
9933   imm_use_iterator imm_iter;
9934   def_operand_p def_p;
9935   gimple *ustmt;
9936
9937   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9938     {
9939       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9940         {
9941           basic_block bb;
9942
9943           if (!is_gimple_debug (ustmt))
9944             continue;
9945
9946           bb = gimple_bb (ustmt);
9947
9948           if (!flow_bb_inside_loop_p (loop, bb))
9949             {
9950               if (gimple_debug_bind_p (ustmt))
9951                 {
9952                   if (dump_enabled_p ())
9953                     dump_printf_loc (MSG_NOTE, vect_location,
9954                                      "killing debug use\n");
9955
9956                   gimple_debug_bind_reset_value (ustmt);
9957                   update_stmt (ustmt);
9958                 }
9959               else
9960                 gcc_unreachable ();
9961             }
9962         }
9963     }
9964 }
9965
9966 /* Given loop represented by LOOP_VINFO, return true if computation of
9967    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9968    otherwise.  */
9969
9970 static bool
9971 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9972 {
9973   /* Constant case.  */
9974   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9975     {
9976       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9977       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9978
9979       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9980       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9981       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9982         return true;
9983     }
9984
9985   widest_int max;
9986   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9987   /* Check the upper bound of loop niters.  */
9988   if (get_max_loop_iterations (loop, &max))
9989     {
9990       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9991       signop sgn = TYPE_SIGN (type);
9992       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9993       if (max < type_max)
9994         return true;
9995     }
9996   return false;
9997 }
9998
9999 /* Return a mask type with half the number of elements as OLD_TYPE,
10000    given that it should have mode NEW_MODE.  */
10001
10002 tree
10003 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10004 {
10005   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10006   return build_truth_vector_type_for_mode (nunits, new_mode);
10007 }
10008
10009 /* Return a mask type with twice as many elements as OLD_TYPE,
10010    given that it should have mode NEW_MODE.  */
10011
10012 tree
10013 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10014 {
10015   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10016   return build_truth_vector_type_for_mode (nunits, new_mode);
10017 }
10018
10019 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10020    contain a sequence of NVECTORS masks that each control a vector of type
10021    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
10022    these vector masks with the vector version of SCALAR_MASK.  */
10023
10024 void
10025 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10026                        unsigned int nvectors, tree vectype, tree scalar_mask)
10027 {
10028   gcc_assert (nvectors != 0);
10029   if (masks->length () < nvectors)
10030     masks->safe_grow_cleared (nvectors, true);
10031   rgroup_controls *rgm = &(*masks)[nvectors - 1];
10032   /* The number of scalars per iteration and the number of vectors are
10033      both compile-time constants.  */
10034   unsigned int nscalars_per_iter
10035     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10036                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10037
10038   if (scalar_mask)
10039     {
10040       scalar_cond_masked_key cond (scalar_mask, nvectors);
10041       loop_vinfo->scalar_cond_masked_set.add (cond);
10042     }
10043
10044   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
10045     {
10046       rgm->max_nscalars_per_iter = nscalars_per_iter;
10047       rgm->type = truth_type_for (vectype);
10048       rgm->factor = 1;
10049     }
10050 }
10051
10052 /* Given a complete set of masks MASKS, extract mask number INDEX
10053    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10054    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
10055
10056    See the comment above vec_loop_masks for more details about the mask
10057    arrangement.  */
10058
10059 tree
10060 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10061                     unsigned int nvectors, tree vectype, unsigned int index)
10062 {
10063   rgroup_controls *rgm = &(*masks)[nvectors - 1];
10064   tree mask_type = rgm->type;
10065
10066   /* Populate the rgroup's mask array, if this is the first time we've
10067      used it.  */
10068   if (rgm->controls.is_empty ())
10069     {
10070       rgm->controls.safe_grow_cleared (nvectors, true);
10071       for (unsigned int i = 0; i < nvectors; ++i)
10072         {
10073           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10074           /* Provide a dummy definition until the real one is available.  */
10075           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10076           rgm->controls[i] = mask;
10077         }
10078     }
10079
10080   tree mask = rgm->controls[index];
10081   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10082                 TYPE_VECTOR_SUBPARTS (vectype)))
10083     {
10084       /* A loop mask for data type X can be reused for data type Y
10085          if X has N times more elements than Y and if Y's elements
10086          are N times bigger than X's.  In this case each sequence
10087          of N elements in the loop mask will be all-zero or all-one.
10088          We can then view-convert the mask so that each sequence of
10089          N elements is replaced by a single element.  */
10090       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10091                               TYPE_VECTOR_SUBPARTS (vectype)));
10092       gimple_seq seq = NULL;
10093       mask_type = truth_type_for (vectype);
10094       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10095       if (seq)
10096         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10097     }
10098   return mask;
10099 }
10100
10101 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10102    lengths for controlling an operation on VECTYPE.  The operation splits
10103    each element of VECTYPE into FACTOR separate subelements, measuring the
10104    length as a number of these subelements.  */
10105
10106 void
10107 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10108                       unsigned int nvectors, tree vectype, unsigned int factor)
10109 {
10110   gcc_assert (nvectors != 0);
10111   if (lens->length () < nvectors)
10112     lens->safe_grow_cleared (nvectors, true);
10113   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10114
10115   /* The number of scalars per iteration, scalar occupied bytes and
10116      the number of vectors are both compile-time constants.  */
10117   unsigned int nscalars_per_iter
10118     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10119                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10120
10121   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10122     {
10123       /* For now, we only support cases in which all loads and stores fall back
10124          to VnQI or none do.  */
10125       gcc_assert (!rgl->max_nscalars_per_iter
10126                   || (rgl->factor == 1 && factor == 1)
10127                   || (rgl->max_nscalars_per_iter * rgl->factor
10128                       == nscalars_per_iter * factor));
10129       rgl->max_nscalars_per_iter = nscalars_per_iter;
10130       rgl->type = vectype;
10131       rgl->factor = factor;
10132     }
10133 }
10134
10135 /* Given a complete set of length LENS, extract length number INDEX for an
10136    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
10137
10138 tree
10139 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10140                    unsigned int nvectors, unsigned int index)
10141 {
10142   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10143   bool use_bias_adjusted_len =
10144     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10145
10146   /* Populate the rgroup's len array, if this is the first time we've
10147      used it.  */
10148   if (rgl->controls.is_empty ())
10149     {
10150       rgl->controls.safe_grow_cleared (nvectors, true);
10151       for (unsigned int i = 0; i < nvectors; ++i)
10152         {
10153           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10154           gcc_assert (len_type != NULL_TREE);
10155
10156           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10157
10158           /* Provide a dummy definition until the real one is available.  */
10159           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10160           rgl->controls[i] = len;
10161
10162           if (use_bias_adjusted_len)
10163             {
10164               gcc_assert (i == 0);
10165               tree adjusted_len =
10166                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10167               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10168               rgl->bias_adjusted_ctrl = adjusted_len;
10169             }
10170         }
10171     }
10172
10173   if (use_bias_adjusted_len)
10174     return rgl->bias_adjusted_ctrl;
10175   else
10176     return rgl->controls[index];
10177 }
10178
10179 /* Scale profiling counters by estimation for LOOP which is vectorized
10180    by factor VF.  */
10181
10182 static void
10183 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10184 {
10185   edge preheader = loop_preheader_edge (loop);
10186   /* Reduce loop iterations by the vectorization factor.  */
10187   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10188   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10189
10190   if (freq_h.nonzero_p ())
10191     {
10192       profile_probability p;
10193
10194       /* Avoid dropping loop body profile counter to 0 because of zero count
10195          in loop's preheader.  */
10196       if (!(freq_e == profile_count::zero ()))
10197         freq_e = freq_e.force_nonzero ();
10198       p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10199       scale_loop_frequencies (loop, p);
10200     }
10201
10202   edge exit_e = single_exit (loop);
10203   exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10204
10205   edge exit_l = single_pred_edge (loop->latch);
10206   profile_probability prob = exit_l->probability;
10207   exit_l->probability = exit_e->probability.invert ();
10208   if (prob.initialized_p () && exit_l->probability.initialized_p ())
10209     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
10210 }
10211
10212 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10213    latch edge values originally defined by it.  */
10214
10215 static void
10216 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10217                                      stmt_vec_info def_stmt_info)
10218 {
10219   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10220   if (!def || TREE_CODE (def) != SSA_NAME)
10221     return;
10222   stmt_vec_info phi_info;
10223   imm_use_iterator iter;
10224   use_operand_p use_p;
10225   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10226     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
10227       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10228           && (phi_info = loop_vinfo->lookup_stmt (phi))
10229           && STMT_VINFO_RELEVANT_P (phi_info)
10230           && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10231           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10232           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10233         {
10234           loop_p loop = gimple_bb (phi)->loop_father;
10235           edge e = loop_latch_edge (loop);
10236           if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
10237             {
10238               vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10239               vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10240               gcc_assert (phi_defs.length () == latch_defs.length ());
10241               for (unsigned i = 0; i < phi_defs.length (); ++i)
10242                 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10243                              gimple_get_lhs (latch_defs[i]), e,
10244                              gimple_phi_arg_location (phi, e->dest_idx));
10245             }
10246         }
10247 }
10248
10249 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10250    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10251    stmt_vec_info.  */
10252
10253 static bool
10254 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10255                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10256 {
10257   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10258   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10259
10260   if (dump_enabled_p ())
10261     dump_printf_loc (MSG_NOTE, vect_location,
10262                      "------>vectorizing statement: %G", stmt_info->stmt);
10263
10264   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10265     vect_loop_kill_debug_uses (loop, stmt_info);
10266
10267   if (!STMT_VINFO_RELEVANT_P (stmt_info)
10268       && !STMT_VINFO_LIVE_P (stmt_info))
10269     return false;
10270
10271   if (STMT_VINFO_VECTYPE (stmt_info))
10272     {
10273       poly_uint64 nunits
10274         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10275       if (!STMT_SLP_TYPE (stmt_info)
10276           && maybe_ne (nunits, vf)
10277           && dump_enabled_p ())
10278         /* For SLP VF is set according to unrolling factor, and not
10279            to vector size, hence for SLP this print is not valid.  */
10280         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10281     }
10282
10283   /* Pure SLP statements have already been vectorized.  We still need
10284      to apply loop vectorization to hybrid SLP statements.  */
10285   if (PURE_SLP_STMT (stmt_info))
10286     return false;
10287
10288   if (dump_enabled_p ())
10289     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10290
10291   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10292     *seen_store = stmt_info;
10293
10294   return true;
10295 }
10296
10297 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
10298    in the hash_map with its corresponding values.  */
10299
10300 static tree
10301 find_in_mapping (tree t, void *context)
10302 {
10303   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10304
10305   tree *value = mapping->get (t);
10306   return value ? *value : t;
10307 }
10308
10309 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
10310    original loop that has now been vectorized.
10311
10312    The inits of the data_references need to be advanced with the number of
10313    iterations of the main loop.  This has been computed in vect_do_peeling and
10314    is stored in parameter ADVANCE.  We first restore the data_references
10315    initial offset with the values recored in ORIG_DRS_INIT.
10316
10317    Since the loop_vec_info of this EPILOGUE was constructed for the original
10318    loop, its stmt_vec_infos all point to the original statements.  These need
10319    to be updated to point to their corresponding copies as well as the SSA_NAMES
10320    in their PATTERN_DEF_SEQs and RELATED_STMTs.
10321
10322    The data_reference's connections also need to be updated.  Their
10323    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10324    stmt_vec_infos, their statements need to point to their corresponding copy,
10325    if they are gather loads or scatter stores then their reference needs to be
10326    updated to point to its corresponding copy and finally we set
10327    'base_misaligned' to false as we have already peeled for alignment in the
10328    prologue of the main loop.  */
10329
10330 static void
10331 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10332 {
10333   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10334   auto_vec<gimple *> stmt_worklist;
10335   hash_map<tree,tree> mapping;
10336   gimple *orig_stmt, *new_stmt;
10337   gimple_stmt_iterator epilogue_gsi;
10338   gphi_iterator epilogue_phi_gsi;
10339   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10340   basic_block *epilogue_bbs = get_loop_body (epilogue);
10341   unsigned i;
10342
10343   free (LOOP_VINFO_BBS (epilogue_vinfo));
10344   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10345
10346   /* Advance data_reference's with the number of iterations of the previous
10347      loop and its prologue.  */
10348   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10349
10350
10351   /* The EPILOGUE loop is a copy of the original loop so they share the same
10352      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
10353      point to the copied statements.  We also create a mapping of all LHS' in
10354      the original loop and all the LHS' in the EPILOGUE and create worklists to
10355      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
10356   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10357     {
10358       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10359            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10360         {
10361           new_stmt = epilogue_phi_gsi.phi ();
10362
10363           gcc_assert (gimple_uid (new_stmt) > 0);
10364           stmt_vinfo
10365             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10366
10367           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10368           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10369
10370           mapping.put (gimple_phi_result (orig_stmt),
10371                        gimple_phi_result (new_stmt));
10372           /* PHI nodes can not have patterns or related statements.  */
10373           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
10374                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
10375         }
10376
10377       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10378            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10379         {
10380           new_stmt = gsi_stmt (epilogue_gsi);
10381           if (is_gimple_debug (new_stmt))
10382             continue;
10383
10384           gcc_assert (gimple_uid (new_stmt) > 0);
10385           stmt_vinfo
10386             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10387
10388           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10389           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10390
10391           if (tree old_lhs = gimple_get_lhs (orig_stmt))
10392             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
10393
10394           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
10395             {
10396               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
10397               for (gimple_stmt_iterator gsi = gsi_start (seq);
10398                    !gsi_end_p (gsi); gsi_next (&gsi))
10399                 stmt_worklist.safe_push (gsi_stmt (gsi));
10400             }
10401
10402           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10403           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10404             {
10405               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10406               stmt_worklist.safe_push (stmt);
10407               /* Set BB such that the assert in
10408                 'get_initial_def_for_reduction' is able to determine that
10409                 the BB of the related stmt is inside this loop.  */
10410               gimple_set_bb (stmt,
10411                              gimple_bb (new_stmt));
10412               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10413               gcc_assert (related_vinfo == NULL
10414                           || related_vinfo == stmt_vinfo);
10415             }
10416         }
10417     }
10418
10419   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
10420      using the original main loop and thus need to be updated to refer to the
10421      cloned variables used in the epilogue.  */
10422   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
10423     {
10424       gimple *stmt = stmt_worklist[i];
10425       tree *new_op;
10426
10427       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
10428         {
10429           tree op = gimple_op (stmt, j);
10430           if ((new_op = mapping.get(op)))
10431             gimple_set_op (stmt, j, *new_op);
10432           else
10433             {
10434               /* PR92429: The last argument of simplify_replace_tree disables
10435                  folding when replacing arguments.  This is required as
10436                  otherwise you might end up with different statements than the
10437                  ones analyzed in vect_loop_analyze, leading to different
10438                  vectorization.  */
10439               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
10440                                           &find_in_mapping, &mapping, false);
10441               gimple_set_op (stmt, j, op);
10442             }
10443         }
10444     }
10445
10446   struct data_reference *dr;
10447   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10448   FOR_EACH_VEC_ELT (datarefs, i, dr)
10449     {
10450       orig_stmt = DR_STMT (dr);
10451       gcc_assert (gimple_uid (orig_stmt) > 0);
10452       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10453       /* Data references for gather loads and scatter stores do not use the
10454          updated offset we set using ADVANCE.  Instead we have to make sure the
10455          reference in the data references point to the corresponding copy of
10456          the original in the epilogue.  */
10457       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
10458           == VMAT_GATHER_SCATTER)
10459         {
10460           DR_REF (dr)
10461             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
10462                                      &find_in_mapping, &mapping);
10463           DR_BASE_ADDRESS (dr)
10464             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
10465                                      &find_in_mapping, &mapping);
10466         }
10467       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10468       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
10469       /* The vector size of the epilogue is smaller than that of the main loop
10470          so the alignment is either the same or lower. This means the dr will
10471          thus by definition be aligned.  */
10472       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
10473     }
10474
10475   epilogue_vinfo->shared->datarefs_copy.release ();
10476   epilogue_vinfo->shared->save_datarefs ();
10477 }
10478
10479 /* Function vect_transform_loop.
10480
10481    The analysis phase has determined that the loop is vectorizable.
10482    Vectorize the loop - created vectorized stmts to replace the scalar
10483    stmts in the loop, and update the loop exit condition.
10484    Returns scalar epilogue loop if any.  */
10485
10486 class loop *
10487 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
10488 {
10489   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10490   class loop *epilogue = NULL;
10491   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
10492   int nbbs = loop->num_nodes;
10493   int i;
10494   tree niters_vector = NULL_TREE;
10495   tree step_vector = NULL_TREE;
10496   tree niters_vector_mult_vf = NULL_TREE;
10497   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10498   unsigned int lowest_vf = constant_lower_bound (vf);
10499   gimple *stmt;
10500   bool check_profitability = false;
10501   unsigned int th;
10502
10503   DUMP_VECT_SCOPE ("vec_transform_loop");
10504
10505   loop_vinfo->shared->check_datarefs ();
10506
10507   /* Use the more conservative vectorization threshold.  If the number
10508      of iterations is constant assume the cost check has been performed
10509      by our caller.  If the threshold makes all loops profitable that
10510      run at least the (estimated) vectorization factor number of times
10511      checking is pointless, too.  */
10512   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
10513   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
10514     {
10515       if (dump_enabled_p ())
10516         dump_printf_loc (MSG_NOTE, vect_location,
10517                          "Profitability threshold is %d loop iterations.\n",
10518                          th);
10519       check_profitability = true;
10520     }
10521
10522   /* Make sure there exists a single-predecessor exit bb.  Do this before
10523      versioning.   */
10524   edge e = single_exit (loop);
10525   if (! single_pred_p (e->dest))
10526     {
10527       split_loop_exit_edge (e, true);
10528       if (dump_enabled_p ())
10529         dump_printf (MSG_NOTE, "split exit edge\n");
10530     }
10531
10532   /* Version the loop first, if required, so the profitability check
10533      comes first.  */
10534
10535   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
10536     {
10537       class loop *sloop
10538         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
10539       sloop->force_vectorize = false;
10540       check_profitability = false;
10541     }
10542
10543   /* Make sure there exists a single-predecessor exit bb also on the
10544      scalar loop copy.  Do this after versioning but before peeling
10545      so CFG structure is fine for both scalar and if-converted loop
10546      to make slpeel_duplicate_current_defs_from_edges face matched
10547      loop closed PHI nodes on the exit.  */
10548   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
10549     {
10550       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
10551       if (! single_pred_p (e->dest))
10552         {
10553           split_loop_exit_edge (e, true);
10554           if (dump_enabled_p ())
10555             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
10556         }
10557     }
10558
10559   tree niters = vect_build_loop_niters (loop_vinfo);
10560   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
10561   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
10562   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
10563   tree advance;
10564   drs_init_vec orig_drs_init;
10565
10566   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
10567                               &step_vector, &niters_vector_mult_vf, th,
10568                               check_profitability, niters_no_overflow,
10569                               &advance);
10570
10571   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
10572       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
10573     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
10574                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10575
10576   if (niters_vector == NULL_TREE)
10577     {
10578       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10579           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
10580           && known_eq (lowest_vf, vf))
10581         {
10582           niters_vector
10583             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
10584                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
10585           step_vector = build_one_cst (TREE_TYPE (niters));
10586         }
10587       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10588         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
10589                                      &step_vector, niters_no_overflow);
10590       else
10591         /* vect_do_peeling subtracted the number of peeled prologue
10592            iterations from LOOP_VINFO_NITERS.  */
10593         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
10594                                      &niters_vector, &step_vector,
10595                                      niters_no_overflow);
10596     }
10597
10598   /* 1) Make sure the loop header has exactly two entries
10599      2) Make sure we have a preheader basic block.  */
10600
10601   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
10602
10603   split_edge (loop_preheader_edge (loop));
10604
10605   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10606     /* This will deal with any possible peeling.  */
10607     vect_prepare_for_masked_peels (loop_vinfo);
10608
10609   /* Schedule the SLP instances first, then handle loop vectorization
10610      below.  */
10611   if (!loop_vinfo->slp_instances.is_empty ())
10612     {
10613       DUMP_VECT_SCOPE ("scheduling SLP instances");
10614       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
10615     }
10616
10617   /* FORNOW: the vectorizer supports only loops which body consist
10618      of one basic block (header + empty latch). When the vectorizer will
10619      support more involved loop forms, the order by which the BBs are
10620      traversed need to be reconsidered.  */
10621
10622   for (i = 0; i < nbbs; i++)
10623     {
10624       basic_block bb = bbs[i];
10625       stmt_vec_info stmt_info;
10626
10627       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10628            gsi_next (&si))
10629         {
10630           gphi *phi = si.phi ();
10631           if (dump_enabled_p ())
10632             dump_printf_loc (MSG_NOTE, vect_location,
10633                              "------>vectorizing phi: %G", (gimple *) phi);
10634           stmt_info = loop_vinfo->lookup_stmt (phi);
10635           if (!stmt_info)
10636             continue;
10637
10638           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10639             vect_loop_kill_debug_uses (loop, stmt_info);
10640
10641           if (!STMT_VINFO_RELEVANT_P (stmt_info)
10642               && !STMT_VINFO_LIVE_P (stmt_info))
10643             continue;
10644
10645           if (STMT_VINFO_VECTYPE (stmt_info)
10646               && (maybe_ne
10647                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
10648               && dump_enabled_p ())
10649             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10650
10651           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10652                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10653                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10654                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10655                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10656               && ! PURE_SLP_STMT (stmt_info))
10657             {
10658               if (dump_enabled_p ())
10659                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
10660               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
10661             }
10662         }
10663
10664       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10665            gsi_next (&si))
10666         {
10667           gphi *phi = si.phi ();
10668           stmt_info = loop_vinfo->lookup_stmt (phi);
10669           if (!stmt_info)
10670             continue;
10671
10672           if (!STMT_VINFO_RELEVANT_P (stmt_info)
10673               && !STMT_VINFO_LIVE_P (stmt_info))
10674             continue;
10675
10676           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10677                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10678                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10679                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10680                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10681               && ! PURE_SLP_STMT (stmt_info))
10682             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
10683         }
10684
10685       for (gimple_stmt_iterator si = gsi_start_bb (bb);
10686            !gsi_end_p (si);)
10687         {
10688           stmt = gsi_stmt (si);
10689           /* During vectorization remove existing clobber stmts.  */
10690           if (gimple_clobber_p (stmt))
10691             {
10692               unlink_stmt_vdef (stmt);
10693               gsi_remove (&si, true);
10694               release_defs (stmt);
10695             }
10696           else
10697             {
10698               /* Ignore vector stmts created in the outer loop.  */
10699               stmt_info = loop_vinfo->lookup_stmt (stmt);
10700
10701               /* vector stmts created in the outer-loop during vectorization of
10702                  stmts in an inner-loop may not have a stmt_info, and do not
10703                  need to be vectorized.  */
10704               stmt_vec_info seen_store = NULL;
10705               if (stmt_info)
10706                 {
10707                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
10708                     {
10709                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
10710                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
10711                            !gsi_end_p (subsi); gsi_next (&subsi))
10712                         {
10713                           stmt_vec_info pat_stmt_info
10714                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
10715                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10716                                                     &si, &seen_store);
10717                         }
10718                       stmt_vec_info pat_stmt_info
10719                         = STMT_VINFO_RELATED_STMT (stmt_info);
10720                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10721                                                     &si, &seen_store))
10722                         maybe_set_vectorized_backedge_value (loop_vinfo,
10723                                                              pat_stmt_info);
10724                     }
10725                   else
10726                     {
10727                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
10728                                                     &seen_store))
10729                         maybe_set_vectorized_backedge_value (loop_vinfo,
10730                                                              stmt_info);
10731                     }
10732                 }
10733               gsi_next (&si);
10734               if (seen_store)
10735                 {
10736                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
10737                     /* Interleaving.  If IS_STORE is TRUE, the
10738                        vectorization of the interleaving chain was
10739                        completed - free all the stores in the chain.  */
10740                     vect_remove_stores (loop_vinfo,
10741                                         DR_GROUP_FIRST_ELEMENT (seen_store));
10742                   else
10743                     /* Free the attached stmt_vec_info and remove the stmt.  */
10744                     loop_vinfo->remove_stmt (stmt_info);
10745                 }
10746             }
10747         }
10748
10749       /* Stub out scalar statements that must not survive vectorization.
10750          Doing this here helps with grouped statements, or statements that
10751          are involved in patterns.  */
10752       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
10753            !gsi_end_p (gsi); gsi_next (&gsi))
10754         {
10755           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
10756           if (!call || !gimple_call_internal_p (call))
10757             continue;
10758           internal_fn ifn = gimple_call_internal_fn (call);
10759           if (ifn == IFN_MASK_LOAD)
10760             {
10761               tree lhs = gimple_get_lhs (call);
10762               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10763                 {
10764                   tree zero = build_zero_cst (TREE_TYPE (lhs));
10765                   gimple *new_stmt = gimple_build_assign (lhs, zero);
10766                   gsi_replace (&gsi, new_stmt, true);
10767                 }
10768             }
10769           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
10770             {
10771               tree lhs = gimple_get_lhs (call);
10772               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10773                 {
10774                   tree else_arg
10775                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
10776                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
10777                   gsi_replace (&gsi, new_stmt, true);
10778                 }
10779             }
10780         }
10781     }                           /* BBs in loop */
10782
10783   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
10784      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
10785   if (integer_onep (step_vector))
10786     niters_no_overflow = true;
10787   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
10788                            niters_vector_mult_vf, !niters_no_overflow);
10789
10790   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
10791   scale_profile_for_vect_loop (loop, assumed_vf);
10792
10793   /* True if the final iteration might not handle a full vector's
10794      worth of scalar iterations.  */
10795   bool final_iter_may_be_partial
10796     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
10797   /* The minimum number of iterations performed by the epilogue.  This
10798      is 1 when peeling for gaps because we always need a final scalar
10799      iteration.  */
10800   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
10801   /* +1 to convert latch counts to loop iteration counts,
10802      -min_epilogue_iters to remove iterations that cannot be performed
10803        by the vector code.  */
10804   int bias_for_lowest = 1 - min_epilogue_iters;
10805   int bias_for_assumed = bias_for_lowest;
10806   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
10807   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
10808     {
10809       /* When the amount of peeling is known at compile time, the first
10810          iteration will have exactly alignment_npeels active elements.
10811          In the worst case it will have at least one.  */
10812       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
10813       bias_for_lowest += lowest_vf - min_first_active;
10814       bias_for_assumed += assumed_vf - min_first_active;
10815     }
10816   /* In these calculations the "- 1" converts loop iteration counts
10817      back to latch counts.  */
10818   if (loop->any_upper_bound)
10819     {
10820       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
10821       loop->nb_iterations_upper_bound
10822         = (final_iter_may_be_partial
10823            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
10824                             lowest_vf) - 1
10825            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
10826                              lowest_vf) - 1);
10827       if (main_vinfo
10828           /* Both peeling for alignment and peeling for gaps can end up
10829              with the scalar epilogue running for more than VF-1 iterations.  */
10830           && !main_vinfo->peeling_for_alignment
10831           && !main_vinfo->peeling_for_gaps)
10832         {
10833           unsigned int bound;
10834           poly_uint64 main_iters
10835             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
10836                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
10837           main_iters
10838             = upper_bound (main_iters,
10839                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
10840           if (can_div_away_from_zero_p (main_iters,
10841                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
10842                                         &bound))
10843             loop->nb_iterations_upper_bound
10844               = wi::umin ((widest_int) (bound - 1),
10845                           loop->nb_iterations_upper_bound);
10846       }
10847   }
10848   if (loop->any_likely_upper_bound)
10849     loop->nb_iterations_likely_upper_bound
10850       = (final_iter_may_be_partial
10851          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
10852                           + bias_for_lowest, lowest_vf) - 1
10853          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
10854                            + bias_for_lowest, lowest_vf) - 1);
10855   if (loop->any_estimate)
10856     loop->nb_iterations_estimate
10857       = (final_iter_may_be_partial
10858          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
10859                           assumed_vf) - 1
10860          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
10861                            assumed_vf) - 1);
10862
10863   if (dump_enabled_p ())
10864     {
10865       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
10866         {
10867           dump_printf_loc (MSG_NOTE, vect_location,
10868                            "LOOP VECTORIZED\n");
10869           if (loop->inner)
10870             dump_printf_loc (MSG_NOTE, vect_location,
10871                              "OUTER LOOP VECTORIZED\n");
10872           dump_printf (MSG_NOTE, "\n");
10873         }
10874       else
10875         dump_printf_loc (MSG_NOTE, vect_location,
10876                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
10877                          GET_MODE_NAME (loop_vinfo->vector_mode));
10878     }
10879
10880   /* Loops vectorized with a variable factor won't benefit from
10881      unrolling/peeling.  */
10882   if (!vf.is_constant ())
10883     {
10884       loop->unroll = 1;
10885       if (dump_enabled_p ())
10886         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
10887                          " variable-length vectorization factor\n");
10888     }
10889   /* Free SLP instances here because otherwise stmt reference counting
10890      won't work.  */
10891   slp_instance instance;
10892   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
10893     vect_free_slp_instance (instance);
10894   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
10895   /* Clear-up safelen field since its value is invalid after vectorization
10896      since vectorized loop can have loop-carried dependencies.  */
10897   loop->safelen = 0;
10898
10899   if (epilogue)
10900     {
10901       update_epilogue_loop_vinfo (epilogue, advance);
10902
10903       epilogue->simduid = loop->simduid;
10904       epilogue->force_vectorize = loop->force_vectorize;
10905       epilogue->dont_vectorize = false;
10906     }
10907
10908   return epilogue;
10909 }
10910
10911 /* The code below is trying to perform simple optimization - revert
10912    if-conversion for masked stores, i.e. if the mask of a store is zero
10913    do not perform it and all stored value producers also if possible.
10914    For example,
10915      for (i=0; i<n; i++)
10916        if (c[i])
10917         {
10918           p1[i] += 1;
10919           p2[i] = p3[i] +2;
10920         }
10921    this transformation will produce the following semi-hammock:
10922
10923    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
10924      {
10925        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10926        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10927        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10928        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10929        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10930        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10931      }
10932 */
10933
10934 void
10935 optimize_mask_stores (class loop *loop)
10936 {
10937   basic_block *bbs = get_loop_body (loop);
10938   unsigned nbbs = loop->num_nodes;
10939   unsigned i;
10940   basic_block bb;
10941   class loop *bb_loop;
10942   gimple_stmt_iterator gsi;
10943   gimple *stmt;
10944   auto_vec<gimple *> worklist;
10945   auto_purge_vect_location sentinel;
10946
10947   vect_location = find_loop_location (loop);
10948   /* Pick up all masked stores in loop if any.  */
10949   for (i = 0; i < nbbs; i++)
10950     {
10951       bb = bbs[i];
10952       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10953            gsi_next (&gsi))
10954         {
10955           stmt = gsi_stmt (gsi);
10956           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10957             worklist.safe_push (stmt);
10958         }
10959     }
10960
10961   free (bbs);
10962   if (worklist.is_empty ())
10963     return;
10964
10965   /* Loop has masked stores.  */
10966   while (!worklist.is_empty ())
10967     {
10968       gimple *last, *last_store;
10969       edge e, efalse;
10970       tree mask;
10971       basic_block store_bb, join_bb;
10972       gimple_stmt_iterator gsi_to;
10973       tree vdef, new_vdef;
10974       gphi *phi;
10975       tree vectype;
10976       tree zero;
10977
10978       last = worklist.pop ();
10979       mask = gimple_call_arg (last, 2);
10980       bb = gimple_bb (last);
10981       /* Create then_bb and if-then structure in CFG, then_bb belongs to
10982          the same loop as if_bb.  It could be different to LOOP when two
10983          level loop-nest is vectorized and mask_store belongs to the inner
10984          one.  */
10985       e = split_block (bb, last);
10986       bb_loop = bb->loop_father;
10987       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10988       join_bb = e->dest;
10989       store_bb = create_empty_bb (bb);
10990       add_bb_to_loop (store_bb, bb_loop);
10991       e->flags = EDGE_TRUE_VALUE;
10992       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10993       /* Put STORE_BB to likely part.  */
10994       efalse->probability = profile_probability::unlikely ();
10995       store_bb->count = efalse->count ();
10996       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10997       if (dom_info_available_p (CDI_DOMINATORS))
10998         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10999       if (dump_enabled_p ())
11000         dump_printf_loc (MSG_NOTE, vect_location,
11001                          "Create new block %d to sink mask stores.",
11002                          store_bb->index);
11003       /* Create vector comparison with boolean result.  */
11004       vectype = TREE_TYPE (mask);
11005       zero = build_zero_cst (vectype);
11006       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11007       gsi = gsi_last_bb (bb);
11008       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11009       /* Create new PHI node for vdef of the last masked store:
11010          .MEM_2 = VDEF <.MEM_1>
11011          will be converted to
11012          .MEM.3 = VDEF <.MEM_1>
11013          and new PHI node will be created in join bb
11014          .MEM_2 = PHI <.MEM_1, .MEM_3>
11015       */
11016       vdef = gimple_vdef (last);
11017       new_vdef = make_ssa_name (gimple_vop (cfun), last);
11018       gimple_set_vdef (last, new_vdef);
11019       phi = create_phi_node (vdef, join_bb);
11020       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11021
11022       /* Put all masked stores with the same mask to STORE_BB if possible.  */
11023       while (true)
11024         {
11025           gimple_stmt_iterator gsi_from;
11026           gimple *stmt1 = NULL;
11027
11028           /* Move masked store to STORE_BB.  */
11029           last_store = last;
11030           gsi = gsi_for_stmt (last);
11031           gsi_from = gsi;
11032           /* Shift GSI to the previous stmt for further traversal.  */
11033           gsi_prev (&gsi);
11034           gsi_to = gsi_start_bb (store_bb);
11035           gsi_move_before (&gsi_from, &gsi_to);
11036           /* Setup GSI_TO to the non-empty block start.  */
11037           gsi_to = gsi_start_bb (store_bb);
11038           if (dump_enabled_p ())
11039             dump_printf_loc (MSG_NOTE, vect_location,
11040                              "Move stmt to created bb\n%G", last);
11041           /* Move all stored value producers if possible.  */
11042           while (!gsi_end_p (gsi))
11043             {
11044               tree lhs;
11045               imm_use_iterator imm_iter;
11046               use_operand_p use_p;
11047               bool res;
11048
11049               /* Skip debug statements.  */
11050               if (is_gimple_debug (gsi_stmt (gsi)))
11051                 {
11052                   gsi_prev (&gsi);
11053                   continue;
11054                 }
11055               stmt1 = gsi_stmt (gsi);
11056               /* Do not consider statements writing to memory or having
11057                  volatile operand.  */
11058               if (gimple_vdef (stmt1)
11059                   || gimple_has_volatile_ops (stmt1))
11060                 break;
11061               gsi_from = gsi;
11062               gsi_prev (&gsi);
11063               lhs = gimple_get_lhs (stmt1);
11064               if (!lhs)
11065                 break;
11066
11067               /* LHS of vectorized stmt must be SSA_NAME.  */
11068               if (TREE_CODE (lhs) != SSA_NAME)
11069                 break;
11070
11071               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11072                 {
11073                   /* Remove dead scalar statement.  */
11074                   if (has_zero_uses (lhs))
11075                     {
11076                       gsi_remove (&gsi_from, true);
11077                       continue;
11078                     }
11079                 }
11080
11081               /* Check that LHS does not have uses outside of STORE_BB.  */
11082               res = true;
11083               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11084                 {
11085                   gimple *use_stmt;
11086                   use_stmt = USE_STMT (use_p);
11087                   if (is_gimple_debug (use_stmt))
11088                     continue;
11089                   if (gimple_bb (use_stmt) != store_bb)
11090                     {
11091                       res = false;
11092                       break;
11093                     }
11094                 }
11095               if (!res)
11096                 break;
11097
11098               if (gimple_vuse (stmt1)
11099                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
11100                 break;
11101
11102               /* Can move STMT1 to STORE_BB.  */
11103               if (dump_enabled_p ())
11104                 dump_printf_loc (MSG_NOTE, vect_location,
11105                                  "Move stmt to created bb\n%G", stmt1);
11106               gsi_move_before (&gsi_from, &gsi_to);
11107               /* Shift GSI_TO for further insertion.  */
11108               gsi_prev (&gsi_to);
11109             }
11110           /* Put other masked stores with the same mask to STORE_BB.  */
11111           if (worklist.is_empty ()
11112               || gimple_call_arg (worklist.last (), 2) != mask
11113               || worklist.last () != stmt1)
11114             break;
11115           last = worklist.pop ();
11116         }
11117       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11118     }
11119 }
11120
11121 /* Decide whether it is possible to use a zero-based induction variable
11122    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
11123    the value that the induction variable must be able to hold in order
11124    to ensure that the rgroups eventually have no active vector elements.
11125    Return -1 otherwise.  */
11126
11127 widest_int
11128 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11129 {
11130   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11131   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11132   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11133
11134   /* Calculate the value that the induction variable must be able
11135      to hit in order to ensure that we end the loop with an all-false mask.
11136      This involves adding the maximum number of inactive trailing scalar
11137      iterations.  */
11138   widest_int iv_limit = -1;
11139   if (max_loop_iterations (loop, &iv_limit))
11140     {
11141       if (niters_skip)
11142         {
11143           /* Add the maximum number of skipped iterations to the
11144              maximum iteration count.  */
11145           if (TREE_CODE (niters_skip) == INTEGER_CST)
11146             iv_limit += wi::to_widest (niters_skip);
11147           else
11148             iv_limit += max_vf - 1;
11149         }
11150       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11151         /* Make a conservatively-correct assumption.  */
11152         iv_limit += max_vf - 1;
11153
11154       /* IV_LIMIT is the maximum number of latch iterations, which is also
11155          the maximum in-range IV value.  Round this value down to the previous
11156          vector alignment boundary and then add an extra full iteration.  */
11157       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11158       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11159     }
11160   return iv_limit;
11161 }
11162
11163 /* For the given rgroup_controls RGC, check whether an induction variable
11164    would ever hit a value that produces a set of all-false masks or zero
11165    lengths before wrapping around.  Return true if it's possible to wrap
11166    around before hitting the desirable value, otherwise return false.  */
11167
11168 bool
11169 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11170 {
11171   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11172
11173   if (iv_limit == -1)
11174     return true;
11175
11176   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11177   unsigned int compare_precision = TYPE_PRECISION (compare_type);
11178   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11179
11180   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11181     return true;
11182
11183   return false;
11184 }