gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it had been manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different vector sizes will, for now, need
146 to specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
147 More flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
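/* The following sketch is not part of this file: it spells out the
   transformation described in the comment above as a self-contained
   translation unit using GCC's vector extension.  The V8HI typedef and
   the loop bodies come from the comment; the value of N and the driver
   functions are illustrative assumptions.  */

#define N 64
typedef int __attribute__ ((mode (V8HI))) v8hi;

short a[N], b[N], c[N];

void
add_scalar (void)
{
  for (int i = 0; i < N; i++)
    a[i] = b[i] + c[i];
}

/* What the vectorizer effectively produces, written by hand.  */
void
add_vectorized (void)
{
  v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
  for (int i = 0; i < N / 8; i++)
    pa[i] = pb[i] + pc[i];
}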
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
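/* Not part of this file: a minimal sketch of the arithmetic described
   above.  The 4-byte element size and 16-byte vector size are the ones
   used in the comment; the helper name is an assumption.  */

static unsigned
example_vf (unsigned vector_size_bytes, unsigned element_size_bytes)
{
  /* VF = how many elements fit in one vector register,
     e.g. 16 / 4 == 4.  */
  return vector_size_bytes / element_size_bytes;
}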
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
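/* Not part of this file: source-level shapes of the evolutions the
   function below accepts and rejects, in the usual chrec notation
   {init, +, step}.  Names and bounds are assumptions.  */

void
example_iv_evolutions (int *a, int n)
{
  int k = 0;
  for (int i = 0; i < n; i++)
    {
      /* i evolves as {0, +, 1}: a simple (degree-1) evolution.  */
      a[i] = 0;
      /* k evolves as {0, +, {0, +, 1}}: the step is itself a chrec,
	 i.e. a polynomial of degree 2, so it is not "simple".  */
      k += i;
    }
  a[0] = k;
}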
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Nonlinear induction is supported only for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step of integer -1. */
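/* Not part of this file: scalar loops whose header phi has one of the
   nonlinear evolutions listed above.  Names, types and bounds are
   illustrative assumptions.  */

void
example_nonlinear_ivs (unsigned *out, int n)
{
  unsigned neg = 1, mul = 1, shl = 1, shr = 1u << 31;
  for (int i = 0; i < n; i++)
    {
      out[i] = neg + mul + shl + shr;
      neg = -neg;	/* 1. neg: recorded with a fake step of -1.  */
      mul = mul * 3;	/* 2. mul by constant.  */
      shl = shl << 1;	/* 3. lshift by constant.  */
      shr = shr >> 1;	/*    rshift by constant.  */
    }
}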
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
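/* Not part of this file: one source form that typically yields the phi
   structure drawn above when the outer loop is analyzed, with the outer
   accumulator phi playing the role of x_1 and the inner one of x_2.
   The array shape and names are assumptions.  */

int
example_double_reduction (int a[64][64])
{
  int sum = 0;
  for (int i = 0; i < 64; i++)		/* outer1/outer2 blocks.  */
    for (int j = 0; j < 64; j++)	/* inner block.  */
      sum += a[i][j];
  return sum;
}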
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
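/* Not part of this file: a minimal source-level first-order recurrence
   of the kind described above.  Names are assumptions.  */

void
example_first_order_recurrence (int *out, const int *in, int n)
{
  int prev = 0;			/* becomes the loop-header phi.  */
  for (int i = 0; i < n; i++)
    {
      out[i] = prev + in[i];	/* uses the previous iteration's value.  */
      prev = in[i];		/* latch definition feeding the phi.  */
    }
}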
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be subsequent
585 SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if one exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such an inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmt in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
945 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
946 unshare_expr (niter),
947 build_int_cst (TREE_TYPE (niter), 1));
948 if (TREE_CODE (niter) == INTEGER_CST
949 && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
951 /* If we manage to fold niter + 1 into INTEGER_CST even when
952 niter is some complex expression, ensure back
953 *number_of_iterationsm1 is an INTEGER_CST as well. See
954 PR113210. */
955 *number_of_iterationsm1
956 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
957 build_minus_one_cst (TREE_TYPE (niter)));
960 *number_of_iterations = niter;
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
966 return conds;
969 /* Determine the main loop exit for the vectorizer. */
971 edge
972 vec_init_loop_exit_info (class loop *loop)
974 /* Before we begin we must first determine which exit is the main one and
975 which are auxiliary exits. */
976 auto_vec<edge> exits = get_loop_exit_edges (loop);
977 if (exits.length () == 1)
978 return exits[0];
980 /* If we have multiple exits we only support counting IV at the moment.
981 Analyze all exits and return the last one we can analyze. */
982 class tree_niter_desc niter_desc;
983 edge candidate = NULL;
984 for (edge exit : exits)
986 if (!get_loop_exit_condition (exit))
987 continue;
989 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
990 && !chrec_contains_undetermined (niter_desc.niter))
992 tree may_be_zero = niter_desc.may_be_zero;
993 if ((integer_zerop (may_be_zero)
994 /* As we are handling may_be_zero that's not false by
995 rewriting niter to may_be_zero ? 0 : niter we require
996 an empty latch. */
997 || (single_pred_p (loop->latch)
998 && exit->src == single_pred (loop->latch)
999 && (integer_nonzerop (may_be_zero)
1000 || COMPARISON_CLASS_P (may_be_zero))))
1001 && (!candidate
1002 || dominated_by_p (CDI_DOMINATORS, exit->src,
1003 candidate->src)))
1004 candidate = exit;
1008 return candidate;
1011 /* Function bb_in_loop_p
1013 Used as predicate for dfs order traversal of the loop bbs. */
1015 static bool
1016 bb_in_loop_p (const_basic_block bb, const void *data)
1018 const class loop *const loop = (const class loop *)data;
1019 if (flow_bb_inside_loop_p (loop, bb))
1020 return true;
1021 return false;
1025 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1026 stmt_vec_info structs for all the stmts in LOOP_IN. */
1028 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1029 : vec_info (vec_info::loop, shared),
1030 loop (loop_in),
1031 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1032 num_itersm1 (NULL_TREE),
1033 num_iters (NULL_TREE),
1034 num_iters_unchanged (NULL_TREE),
1035 num_iters_assumptions (NULL_TREE),
1036 vector_costs (nullptr),
1037 scalar_costs (nullptr),
1038 th (0),
1039 versioning_threshold (0),
1040 vectorization_factor (0),
1041 main_loop_edge (nullptr),
1042 skip_main_loop_edge (nullptr),
1043 skip_this_loop_edge (nullptr),
1044 reusable_accumulators (),
1045 suggested_unroll_factor (1),
1046 max_vectorization_factor (0),
1047 mask_skip_niters (NULL_TREE),
1048 rgroup_compare_type (NULL_TREE),
1049 simd_if_cond (NULL_TREE),
1050 partial_vector_style (vect_partial_vectors_none),
1051 unaligned_dr (NULL),
1052 peeling_for_alignment (0),
1053 ptr_mask (0),
1054 ivexpr_map (NULL),
1055 scan_map (NULL),
1056 slp_unrolling_factor (1),
1057 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1058 vectorizable (false),
1059 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1060 using_partial_vectors_p (false),
1061 using_decrementing_iv_p (false),
1062 using_select_vl_p (false),
1063 epil_using_partial_vectors_p (false),
1064 partial_load_store_bias (0),
1065 peeling_for_gaps (false),
1066 peeling_for_niter (false),
1067 early_breaks (false),
1068 no_data_dependencies (false),
1069 has_mask_store (false),
1070 scalar_loop_scaling (profile_probability::uninitialized ()),
1071 scalar_loop (NULL),
1072 orig_loop_info (NULL),
1073 vec_loop_iv_exit (NULL),
1074 vec_epilogue_loop_iv_exit (NULL),
1075 scalar_loop_iv_exit (NULL)
1077 /* CHECKME: We want to visit all BBs before their successors (except for
1078 latch blocks, for which this assertion wouldn't hold). In the simple
1079 case of the loop forms we allow, a dfs order of the BBs would be the same
1080 as reversed postorder traversal, so we are safe. */
1082 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1083 bbs, loop->num_nodes, loop);
1084 gcc_assert (nbbs == loop->num_nodes);
1086 for (unsigned int i = 0; i < nbbs; i++)
1088 basic_block bb = bbs[i];
1089 gimple_stmt_iterator si;
1091 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1093 gimple *phi = gsi_stmt (si);
1094 gimple_set_uid (phi, 0);
1095 add_stmt (phi);
1098 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1100 gimple *stmt = gsi_stmt (si);
1101 gimple_set_uid (stmt, 0);
1102 if (is_gimple_debug (stmt))
1103 continue;
1104 add_stmt (stmt);
1105 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1106 third argument is the #pragma omp simd if (x) condition, when 0,
1107 loop shouldn't be vectorized, when non-zero constant, it should
1108 be vectorized normally, otherwise versioned with vectorized loop
1109 done if the condition is non-zero at runtime. */
1110 if (loop_in->simduid
1111 && is_gimple_call (stmt)
1112 && gimple_call_internal_p (stmt)
1113 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1114 && gimple_call_num_args (stmt) >= 3
1115 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1116 && (loop_in->simduid
1117 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1119 tree arg = gimple_call_arg (stmt, 2);
1120 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1121 simd_if_cond = arg;
1122 else
1123 gcc_assert (integer_nonzerop (arg));
1128 epilogue_vinfos.create (6);
1131 /* Free all levels of rgroup CONTROLS. */
1133 void
1134 release_vec_loop_controls (vec<rgroup_controls> *controls)
1136 rgroup_controls *rgc;
1137 unsigned int i;
1138 FOR_EACH_VEC_ELT (*controls, i, rgc)
1139 rgc->controls.release ();
1140 controls->release ();
1143 /* Free all memory used by the _loop_vec_info, as well as all the
1144 stmt_vec_info structs of all the stmts in the loop. */
1146 _loop_vec_info::~_loop_vec_info ()
1148 free (bbs);
1150 release_vec_loop_controls (&masks.rgc_vec);
1151 release_vec_loop_controls (&lens);
1152 delete ivexpr_map;
1153 delete scan_map;
1154 epilogue_vinfos.release ();
1155 delete scalar_costs;
1156 delete vector_costs;
1158 /* When we release an epilogue vinfo that we do not intend to use
1159 avoid clearing AUX of the main loop which should continue to
1160 point to the main loop vinfo since otherwise we'll leak that. */
1161 if (loop->aux == this)
1162 loop->aux = NULL;
1165 /* Return an invariant or register for EXPR and emit necessary
1166 computations in the LOOP_VINFO loop preheader. */
1168 tree
1169 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1171 if (is_gimple_reg (expr)
1172 || is_gimple_min_invariant (expr))
1173 return expr;
1175 if (! loop_vinfo->ivexpr_map)
1176 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1177 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1178 if (! cached)
1180 gimple_seq stmts = NULL;
1181 cached = force_gimple_operand (unshare_expr (expr),
1182 &stmts, true, NULL_TREE);
1183 if (stmts)
1185 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1186 gsi_insert_seq_on_edge_immediate (e, stmts);
1189 return cached;
1192 /* Return true if we can use CMP_TYPE as the comparison type to produce
1193 all masks required to mask LOOP_VINFO. */
1195 static bool
1196 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1198 rgroup_controls *rgm;
1199 unsigned int i;
1200 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1201 if (rgm->type != NULL_TREE
1202 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1203 cmp_type, rgm->type,
1204 OPTIMIZE_FOR_SPEED))
1205 return false;
1206 return true;
1209 /* Calculate the maximum number of scalars per iteration for every
1210 rgroup in LOOP_VINFO. */
1212 static unsigned int
1213 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1215 unsigned int res = 1;
1216 unsigned int i;
1217 rgroup_controls *rgm;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1219 res = MAX (res, rgm->max_nscalars_per_iter);
1220 return res;
1223 /* Calculate the minimum precision necessary to represent:
1225 MAX_NITERS * FACTOR
1227 as an unsigned integer, where MAX_NITERS is the maximum number of
1228 loop header iterations for the original scalar form of LOOP_VINFO. */
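/* Not part of this file: the computation described above, spelled out
   for values that fit in 64 bits.  For example MAX_NITERS = 1000 and
   FACTOR = 4 give a limit of 4000, which needs 12 bits.  The helper
   name is an assumption.  */

static unsigned
example_min_prec (unsigned long long max_niters, unsigned long long factor)
{
  unsigned long long limit = max_niters * factor;
  unsigned prec = 0;
  while (limit)
    {
      prec++;
      limit >>= 1;
    }
  return prec;
}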
1230 static unsigned
1231 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1233 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235 /* Get the maximum number of iterations that is representable
1236 in the counter type. */
1237 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1238 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1240 /* Get a more refined estimate for the number of iterations. */
1241 widest_int max_back_edges;
1242 if (max_loop_iterations (loop, &max_back_edges))
1243 max_ni = wi::smin (max_ni, max_back_edges + 1);
1245 /* Work out how many bits we need to represent the limit. */
1246 return wi::min_precision (max_ni * factor, UNSIGNED);
1249 /* True if the loop needs peeling or partial vectors when vectorized. */
1251 static bool
1252 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1254 unsigned HOST_WIDE_INT const_vf;
1255 HOST_WIDE_INT max_niter
1256 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1258 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1259 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1260 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1261 (loop_vinfo));
1263 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1264 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1266 /* Work out the (constant) number of iterations that need to be
1267 peeled for reasons other than niters. */
1268 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1269 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1270 peel_niter += 1;
1271 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1272 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1273 return true;
1275 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1276 /* ??? When peeling for gaps but not alignment, we could
1277 try to check whether the (variable) niters is known to be
1278 VF * N + 1. That's something of a niche case though. */
1279 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1280 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1281 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1282 < (unsigned) exact_log2 (const_vf))
1283 /* In case of versioning, check if the maximum number of
1284 iterations is greater than th. If they are identical,
1285 the epilogue is unnecessary. */
1286 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1287 || ((unsigned HOST_WIDE_INT) max_niter
1288 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1289 but that's only computed later based on our result.
1290 The following is the most conservative approximation. */
1291 > (std::max ((unsigned HOST_WIDE_INT) th,
1292 const_vf) / const_vf) * const_vf))))
1293 return true;
1295 return false;
1298 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1299 whether we can actually generate the masks required. Return true if so,
1300 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1302 static bool
1303 vect_verify_full_masking (loop_vec_info loop_vinfo)
1305 unsigned int min_ni_width;
1307 /* Use a normal loop if there are no statements that need masking.
1308 This only happens in rare degenerate cases: it means that the loop
1309 has no loads, no stores, and no live-out values. */
1310 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1311 return false;
1313 /* Produce the rgroup controls. */
1314 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1316 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1317 tree vectype = mask.first;
1318 unsigned nvectors = mask.second;
1320 if (masks->rgc_vec.length () < nvectors)
1321 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1322 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1323 /* The number of scalars per iteration and the number of vectors are
1324 both compile-time constants. */
1325 unsigned int nscalars_per_iter
1326 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1327 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1329 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1331 rgm->max_nscalars_per_iter = nscalars_per_iter;
1332 rgm->type = truth_type_for (vectype);
1333 rgm->factor = 1;
1337 unsigned int max_nscalars_per_iter
1338 = vect_get_max_nscalars_per_iter (loop_vinfo);
1340 /* Work out how many bits we need to represent the limit. */
1341 min_ni_width
1342 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1344 /* Find a scalar mode for which WHILE_ULT is supported. */
1345 opt_scalar_int_mode cmp_mode_iter;
1346 tree cmp_type = NULL_TREE;
1347 tree iv_type = NULL_TREE;
1348 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1349 unsigned int iv_precision = UINT_MAX;
1351 if (iv_limit != -1)
1352 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1353 UNSIGNED);
1355 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1357 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1358 if (cmp_bits >= min_ni_width
1359 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1361 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1362 if (this_type
1363 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1365 /* Although we could stop as soon as we find a valid mode,
1366 there are at least two reasons why that's not always the
1367 best choice:
1369 - An IV that's Pmode or wider is more likely to be reusable
1370 in address calculations than an IV that's narrower than
1371 Pmode.
1373 - Doing the comparison in IV_PRECISION or wider allows
1374 a natural 0-based IV, whereas using a narrower comparison
1375 type requires mitigations against wrap-around.
1377 Conversely, if the IV limit is variable, doing the comparison
1378 in a wider type than the original type can introduce
1379 unnecessary extensions, so picking the widest valid mode
1380 is not always a good choice either.
1382 Here we prefer the first IV type that's Pmode or wider,
1383 and the first comparison type that's IV_PRECISION or wider.
1384 (The comparison type must be no wider than the IV type,
1385 to avoid extensions in the vector loop.)
1387 ??? We might want to try continuing beyond Pmode for ILP32
1388 targets if CMP_BITS < IV_PRECISION. */
1389 iv_type = this_type;
1390 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1391 cmp_type = this_type;
1392 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1393 break;
1398 if (!cmp_type)
1400 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1401 return false;
1404 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1405 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1406 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1407 return true;
1410 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1411 whether we can actually generate AVX512 style masks. Return true if so,
1412 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1414 static bool
1415 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1417 /* Produce differently organized rgc_vec and differently check
1418 we can produce masks. */
1420 /* Use a normal loop if there are no statements that need masking.
1421 This only happens in rare degenerate cases: it means that the loop
1422 has no loads, no stores, and no live-out values. */
1423 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1424 return false;
1426 /* For the decrementing IV we need to represent all values in
1427 [0, niter + niter_skip] where niter_skip is the elements we
1428 skip in the first iteration for prologue peeling. */
1429 tree iv_type = NULL_TREE;
1430 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1431 unsigned int iv_precision = UINT_MAX;
1432 if (iv_limit != -1)
1433 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1435 /* First compute the type for the IV we use to track the remaining
1436 scalar iterations. */
1437 opt_scalar_int_mode cmp_mode_iter;
1438 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1440 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1441 if (cmp_bits >= iv_precision
1442 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1444 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1445 if (iv_type)
1446 break;
1449 if (!iv_type)
1450 return false;
1452 /* Produce the rgroup controls. */
1453 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1455 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1456 tree vectype = mask.first;
1457 unsigned nvectors = mask.second;
1459 /* The number of scalars per iteration and the number of vectors are
1460 both compile-time constants. */
1461 unsigned int nscalars_per_iter
1462 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1463 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1465 /* We index the rgroup_controls vector with nscalars_per_iter
1466 which we keep constant and instead have a varying nvectors,
1467 remembering the vector mask with the fewest nV. */
1468 if (masks->rgc_vec.length () < nscalars_per_iter)
1469 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1470 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1472 if (!rgm->type || rgm->factor > nvectors)
1474 rgm->type = truth_type_for (vectype);
1475 rgm->compare_type = NULL_TREE;
1476 rgm->max_nscalars_per_iter = nscalars_per_iter;
1477 rgm->factor = nvectors;
1478 rgm->bias_adjusted_ctrl = NULL_TREE;
1482 /* There is no fixed compare type we are going to use but we have to
1483 be able to get at one for each mask group. */
1484 unsigned int min_ni_width
1485 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1487 bool ok = true;
1488 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1490 tree mask_type = rgc.type;
1491 if (!mask_type)
1492 continue;
1494 /* For now vect_get_loop_mask only supports integer mode masks
1495 when we need to split it. */
1496 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1497 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1499 ok = false;
1500 break;
1503 /* If iv_type is usable as compare type use that - we can elide the
1504 saturation in that case. */
1505 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1507 tree cmp_vectype
1508 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1509 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1510 rgc.compare_type = cmp_vectype;
1512 if (!rgc.compare_type)
1513 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1515 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1516 if (cmp_bits >= min_ni_width
1517 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1519 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1520 if (!cmp_type)
1521 continue;
1523 /* Check whether we can produce the mask with cmp_type. */
1524 tree cmp_vectype
1525 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1526 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1528 rgc.compare_type = cmp_vectype;
1529 break;
1533 if (!rgc.compare_type)
1535 ok = false;
1536 break;
1539 if (!ok)
1541 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1542 return false;
1545 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1546 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1547 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1548 return true;
1551 /* Check whether we can use vector access with length based on precision
1552 comparison. So far, to keep it simple, we only allow the case that the
1553 precision of the target supported length is larger than the precision
1554 required by loop niters. */
1556 static bool
1557 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1559 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1560 return false;
1562 machine_mode len_load_mode, len_store_mode;
1563 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1564 .exists (&len_load_mode))
1565 return false;
1566 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1567 .exists (&len_store_mode))
1568 return false;
1570 signed char partial_load_bias = internal_len_load_store_bias
1571 (IFN_LEN_LOAD, len_load_mode);
1573 signed char partial_store_bias = internal_len_load_store_bias
1574 (IFN_LEN_STORE, len_store_mode);
1576 gcc_assert (partial_load_bias == partial_store_bias);
1578 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1579 return false;
1581 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1582 len_loads with a length of zero. In order to avoid that we prohibit
1583 more than one loop length here. */
1584 if (partial_load_bias == -1
1585 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1586 return false;
1588 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1590 unsigned int max_nitems_per_iter = 1;
1591 unsigned int i;
1592 rgroup_controls *rgl;
1593 /* Find the maximum number of items per iteration for every rgroup. */
1594 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1596 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1597 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1600 /* Work out how many bits we need to represent the length limit. */
1601 unsigned int min_ni_prec
1602 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1604 /* Now use the maximum of below precisions for one suitable IV type:
1605 - the IV's natural precision
1606 - the precision needed to hold: the maximum number of scalar
1607 iterations multiplied by the scale factor (min_ni_prec above)
1608 - the Pmode precision
1610 If min_ni_prec is less than the precision of the current niters,
1611 we prefer to still use the niters type. Prefer to use Pmode and
1612 wider IV to avoid narrow conversions. */
1614 unsigned int ni_prec
1615 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1616 min_ni_prec = MAX (min_ni_prec, ni_prec);
1617 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1619 tree iv_type = NULL_TREE;
1620 opt_scalar_int_mode tmode_iter;
1621 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1623 scalar_mode tmode = tmode_iter.require ();
1624 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1626 /* ??? Do we really want to construct one IV whose precision exceeds
1627 BITS_PER_WORD? */
1628 if (tbits > BITS_PER_WORD)
1629 break;
1631 /* Find the first available standard integral type. */
1632 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1634 iv_type = build_nonstandard_integer_type (tbits, true);
1635 break;
1639 if (!iv_type)
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "can't vectorize with length-based partial vectors"
1644 " because there is no suitable iv type.\n");
1645 return false;
1648 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1649 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1650 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1652 return true;
1655 /* Calculate the cost of one scalar iteration of the loop. */
1656 static void
1657 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1659 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1660 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1661 int nbbs = loop->num_nodes, factor;
1662 int innerloop_iters, i;
1664 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1666 /* Gather costs for statements in the scalar loop. */
1668 /* FORNOW. */
1669 innerloop_iters = 1;
1670 if (loop->inner)
1671 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1673 for (i = 0; i < nbbs; i++)
1675 gimple_stmt_iterator si;
1676 basic_block bb = bbs[i];
1678 if (bb->loop_father == loop->inner)
1679 factor = innerloop_iters;
1680 else
1681 factor = 1;
1683 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1685 gimple *stmt = gsi_stmt (si);
1686 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1688 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1689 continue;
1691 /* Skip stmts that are not vectorized inside the loop. */
1692 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1693 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1694 && (!STMT_VINFO_LIVE_P (vstmt_info)
1695 || !VECTORIZABLE_CYCLE_DEF
1696 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1697 continue;
1699 vect_cost_for_stmt kind;
1700 if (STMT_VINFO_DATA_REF (stmt_info))
1702 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1703 kind = scalar_load;
1704 else
1705 kind = scalar_store;
1707 else if (vect_nop_conversion_p (stmt_info))
1708 continue;
1709 else
1710 kind = scalar_stmt;
1712 /* We are using vect_prologue here to avoid scaling twice
1713 by the inner loop factor. */
1714 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1715 factor, kind, stmt_info, 0, vect_prologue);
1719 /* Now accumulate cost. */
1720 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1721 add_stmt_costs (loop_vinfo->scalar_costs,
1722 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1723 loop_vinfo->scalar_costs->finish_cost (nullptr);
1726 /* Function vect_analyze_loop_form.
1728 Verify that certain CFG restrictions hold, including:
1729 - the loop has a pre-header
1730 - the loop has a single entry
1731 - nested loops can have only a single exit.
1732 - the loop exit condition is simple enough
1733 - the number of iterations can be analyzed, i.e., a countable loop. The
1734 niter could be analyzed under some assumptions. */
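/* Not part of this file: a loop whose iteration count the analysis can
   determine next to one where it cannot.  Names are assumptions.  */

int
example_countable (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)	/* niters is computable from n.  */
    s += a[i];
  return s;
}

struct example_node { struct example_node *next; int val; };

int
example_uncountable (const struct example_node *p)
{
  int s = 0;
  for (; p; p = p->next)	/* niters unknown: pointer-chasing walk.  */
    s += p->val;
  return s;
}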
1736 opt_result
1737 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1739 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1741 edge exit_e = vec_init_loop_exit_info (loop);
1742 if (!exit_e)
1743 return opt_result::failure_at (vect_location,
1744 "not vectorized:"
1745 " could not determine main exit from"
1746 " loop with multiple exits.\n");
1747 info->loop_exit = exit_e;
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "using as main loop exit: %d -> %d [AUX: %p]\n",
1751 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1753 /* Check if we have any control flow that doesn't leave the loop. */
1754 class loop *v_loop = loop->inner ? loop->inner : loop;
1755 basic_block *bbs = get_loop_body (v_loop);
1756 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1757 if (EDGE_COUNT (bbs[i]->succs) != 1
1758 && (EDGE_COUNT (bbs[i]->succs) != 2
1759 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1761 free (bbs);
1762 return opt_result::failure_at (vect_location,
1763 "not vectorized:"
1764 " unsupported control flow in loop.\n");
1766 free (bbs);
1768 /* Different restrictions apply when we are considering an inner-most loop,
1769 vs. an outer (nested) loop.
1770 (FORNOW. May want to relax some of these restrictions in the future). */
1772 info->inner_loop_cond = NULL;
1773 if (!loop->inner)
1775 /* Inner-most loop. */
1777 if (empty_block_p (loop->header))
1778 return opt_result::failure_at (vect_location,
1779 "not vectorized: empty loop.\n");
1781 else
1783 class loop *innerloop = loop->inner;
1784 edge entryedge;
1786 /* Nested loop. We currently require that the loop is doubly-nested,
1787 contains a single inner loop with a single exit to the block
1788 with the single exit condition in the outer loop.
1789 Vectorizable outer-loops look like this:
1791 (pre-header)
1793 header <---+
1795 inner-loop |
1797 tail ------+
1799 (exit-bb)
1801 The inner-loop also has the properties expected of inner-most loops
1802 as described above. */
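/* As a rough source-level sketch (illustrative only), a vectorizable
   outer loop of the shape above could come from code such as:

     for (i = 0; i < N; i++)
       {
         s = 0;
         for (j = 0; j < M; j++)
           s += a[i][j];
         b[i] = s;
       }

   where the outer loop is vectorized across 'i' and the inner loop is
   the single nested loop with a single exit into the outer tail.  */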
1804 if ((loop->inner)->inner || (loop->inner)->next)
1805 return opt_result::failure_at (vect_location,
1806 "not vectorized:"
1807 " multiple nested loops.\n");
1809 entryedge = loop_preheader_edge (innerloop);
1810 if (entryedge->src != loop->header
1811 || !single_exit (innerloop)
1812 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1813 return opt_result::failure_at (vect_location,
1814 "not vectorized:"
1815 " unsupported outerloop form.\n");
1817 /* Analyze the inner-loop. */
1818 vect_loop_form_info inner;
1819 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1820 if (!res)
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: Bad inner loop.\n");
1825 return res;
1828 /* Don't support analyzing niter under assumptions for inner
1829 loop. */
1830 if (!integer_onep (inner.assumptions))
1831 return opt_result::failure_at (vect_location,
1832 "not vectorized: Bad inner loop.\n");
1834 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1835 return opt_result::failure_at (vect_location,
1836 "not vectorized: inner-loop count not"
1837 " invariant.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "Considering outer-loop vectorization.\n");
1842 info->inner_loop_cond = inner.conds[0];
1845 if (EDGE_COUNT (loop->header->preds) != 2)
1846 return opt_result::failure_at (vect_location,
1847 "not vectorized:"
1848 " too many incoming edges.\n");
1850 /* We assume that the latch is empty. */
1851 if (!empty_block_p (loop->latch)
1852 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1853 return opt_result::failure_at (vect_location,
1854 "not vectorized: latch block not empty.\n");
1856 /* Make sure there is no abnormal exit. */
1857 auto_vec<edge> exits = get_loop_exit_edges (loop);
1858 for (edge e : exits)
1860 if (e->flags & EDGE_ABNORMAL)
1861 return opt_result::failure_at (vect_location,
1862 "not vectorized:"
1863 " abnormal loop exit edge.\n");
1866 info->conds
1867 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1868 &info->number_of_iterations,
1869 &info->number_of_iterationsm1);
1870 if (info->conds.is_empty ())
1871 return opt_result::failure_at
1872 (vect_location,
1873 "not vectorized: complicated exit condition.\n");
1875 /* Determine what the primary and alternate exit conds are. */
1876 for (unsigned i = 0; i < info->conds.length (); i++)
1878 gcond *cond = info->conds[i];
1879 if (exit_e->src == gimple_bb (cond))
1880 std::swap (info->conds[0], info->conds[i]);
1883 if (integer_zerop (info->assumptions)
1884 || !info->number_of_iterations
1885 || chrec_contains_undetermined (info->number_of_iterations))
1886 return opt_result::failure_at
1887 (info->conds[0],
1888 "not vectorized: number of iterations cannot be computed.\n");
1890 if (integer_zerop (info->number_of_iterations))
1891 return opt_result::failure_at
1892 (info->conds[0],
1893 "not vectorized: number of iterations = 0.\n");
1895 if (!(tree_fits_shwi_p (info->number_of_iterations)
1896 && tree_to_shwi (info->number_of_iterations) > 0))
1898 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_NOTE, vect_location,
1901 "Symbolic number of iterations is ");
1902 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1903 dump_printf (MSG_NOTE, "\n");
1907 return opt_result::success ();
1910 /* Create a loop_vec_info for LOOP with SHARED and the
1911 vect_analyze_loop_form result. */
1913 loop_vec_info
1914 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1915 const vect_loop_form_info *info,
1916 loop_vec_info main_loop_info)
1918 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1919 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1920 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1921 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1922 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1923 /* Also record the assumptions for versioning. */
1924 if (!integer_onep (info->assumptions) && !main_loop_info)
1925 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1927 for (gcond *cond : info->conds)
1929 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1930 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1931 /* Mark the statement as a condition. */
1932 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1935 for (unsigned i = 1; i < info->conds.length (); i ++)
1936 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1937 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1939 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1941 /* Check to see if we're vectorizing multiple exits. */
1942 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1943 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1945 if (info->inner_loop_cond)
1947 stmt_vec_info inner_loop_cond_info
1948 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1949 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1950 /* If we have an estimate on the number of iterations of the inner
1951 loop, use that to limit the scale for costing, otherwise use
1952 --param vect-inner-loop-cost-factor literally. */
1953 widest_int nit;
1954 if (estimated_stmt_executions (loop->inner, &nit))
1955 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1956 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
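/* Illustrative example: with an estimated 10 iterations of the inner
   loop and --param vect-inner-loop-cost-factor set to 50, the factor
   used for costing would be MIN (10, 50) = 10.  */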
1959 return loop_vinfo;
1964 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1965 statements, update the vectorization factor.
1967 static void
1968 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1970 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1972 int nbbs = loop->num_nodes;
1973 poly_uint64 vectorization_factor;
1974 int i;
1976 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1978 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979 gcc_assert (known_ne (vectorization_factor, 0U));
1981 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1982 vectorization factor of the loop is the unrolling factor required by
1983 the SLP instances. If that unrolling factor is 1, we say that we
1984 perform pure SLP on the loop - cross-iteration parallelism is not
1985 exploited. */
1986 bool only_slp_in_loop = true;
1987 for (i = 0; i < nbbs; i++)
1989 basic_block bb = bbs[i];
1990 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1991 gsi_next (&si))
1993 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1994 if (!stmt_info)
1995 continue;
1996 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1997 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1998 && !PURE_SLP_STMT (stmt_info))
1999 /* STMT needs both SLP and loop-based vectorization. */
2000 only_slp_in_loop = false;
2002 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2003 gsi_next (&si))
2005 if (is_gimple_debug (gsi_stmt (si)))
2006 continue;
2007 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2008 stmt_info = vect_stmt_to_vectorize (stmt_info);
2009 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2010 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2011 && !PURE_SLP_STMT (stmt_info))
2012 /* STMT needs both SLP and loop-based vectorization. */
2013 only_slp_in_loop = false;
2017 if (only_slp_in_loop)
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Loop contains only SLP stmts\n");
2022 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2024 else
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE, vect_location,
2028 "Loop contains SLP and non-SLP stmts\n");
2029 /* Both the vectorization factor and unroll factor have the form
2030 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2031 so they must have a common multiple. */
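/* For example (illustrative): a vectorization factor of 4 combined
   with an SLP unrolling factor of 6 is forced to their common
   multiple 12.  */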
2032 vectorization_factor
2033 = force_common_multiple (vectorization_factor,
2034 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2037 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2038 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Updating vectorization factor to ");
2042 dump_dec (MSG_NOTE, vectorization_factor);
2043 dump_printf (MSG_NOTE, ".\n");
2047 /* Return true if STMT_INFO describes a double reduction phi and if
2048 the other phi in the reduction is also relevant for vectorization.
2049 This rejects cases such as:
2051 outer1:
2052 x_1 = PHI <x_3(outer2), ...>;
2055 inner:
2056 x_2 = ...;
2059 outer2:
2060 x_3 = PHI <x_2(inner)>;
2062 if nothing in x_2 or elsewhere makes x_1 relevant. */
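/* A typical source-level double reduction (illustrative sketch) is:

     s = 0;
     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
         s += a[i][j];

   where the sum carried by the inner loop feeds back into the outer
   loop through the pair of phis shown above.  */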
2064 static bool
2065 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2067 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2068 return false;
2070 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2073 /* Function vect_analyze_loop_operations.
2075 Scan the loop stmts and make sure they are all vectorizable. */
2077 static opt_result
2078 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2080 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2081 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2082 int nbbs = loop->num_nodes;
2083 int i;
2084 stmt_vec_info stmt_info;
2085 bool need_to_vectorize = false;
2086 bool ok;
2088 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2090 auto_vec<stmt_info_for_cost> cost_vec;
2092 for (i = 0; i < nbbs; i++)
2094 basic_block bb = bbs[i];
2096 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2097 gsi_next (&si))
2099 gphi *phi = si.phi ();
2100 ok = true;
2102 stmt_info = loop_vinfo->lookup_stmt (phi);
2103 if (dump_enabled_p ())
2104 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2105 (gimple *) phi);
2106 if (virtual_operand_p (gimple_phi_result (phi)))
2107 continue;
2109 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2110 (i.e., a phi in the tail of the outer-loop). */
2111 if (! is_loop_header_bb_p (bb))
2113 /* FORNOW: we currently don't support the case that these phis
2114 are not used in the outer loop (unless it is a double reduction,
2115 i.e., this phi is vect_reduction_def), because this case
2116 requires us to actually do something here. */
2117 if (STMT_VINFO_LIVE_P (stmt_info)
2118 && !vect_active_double_reduction_p (stmt_info))
2119 return opt_result::failure_at (phi,
2120 "Unsupported loop-closed phi"
2121 " in outer-loop.\n");
2123 /* If PHI is used in the outer loop, we check that its operand
2124 is defined in the inner loop. */
2125 if (STMT_VINFO_RELEVANT_P (stmt_info))
2127 tree phi_op;
2129 if (gimple_phi_num_args (phi) != 1)
2130 return opt_result::failure_at (phi, "unsupported phi");
2132 phi_op = PHI_ARG_DEF (phi, 0);
2133 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2134 if (!op_def_info)
2135 return opt_result::failure_at (phi, "unsupported phi\n");
2137 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2138 && (STMT_VINFO_RELEVANT (op_def_info)
2139 != vect_used_in_outer_by_reduction))
2140 return opt_result::failure_at (phi, "unsupported phi\n");
2142 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2143 || (STMT_VINFO_DEF_TYPE (stmt_info)
2144 == vect_double_reduction_def))
2145 && !vectorizable_lc_phi (loop_vinfo,
2146 stmt_info, NULL, NULL))
2147 return opt_result::failure_at (phi, "unsupported phi\n");
2150 continue;
2153 gcc_assert (stmt_info);
2155 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2156 || STMT_VINFO_LIVE_P (stmt_info))
2157 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2158 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2159 /* A scalar-dependence cycle that we don't support. */
2160 return opt_result::failure_at (phi,
2161 "not vectorized:"
2162 " scalar dependence cycle.\n");
2164 if (STMT_VINFO_RELEVANT_P (stmt_info))
2166 need_to_vectorize = true;
2167 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2168 && ! PURE_SLP_STMT (stmt_info))
2169 ok = vectorizable_induction (loop_vinfo,
2170 stmt_info, NULL, NULL,
2171 &cost_vec);
2172 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2173 || (STMT_VINFO_DEF_TYPE (stmt_info)
2174 == vect_double_reduction_def)
2175 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2176 && ! PURE_SLP_STMT (stmt_info))
2177 ok = vectorizable_reduction (loop_vinfo,
2178 stmt_info, NULL, NULL, &cost_vec);
2179 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2180 == vect_first_order_recurrence)
2181 && ! PURE_SLP_STMT (stmt_info))
2182 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2183 &cost_vec);
2186 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2187 if (ok
2188 && STMT_VINFO_LIVE_P (stmt_info)
2189 && !PURE_SLP_STMT (stmt_info))
2190 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2191 -1, false, &cost_vec);
2193 if (!ok)
2194 return opt_result::failure_at (phi,
2195 "not vectorized: relevant phi not "
2196 "supported: %G",
2197 static_cast <gimple *> (phi));
2200 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2201 gsi_next (&si))
2203 gimple *stmt = gsi_stmt (si);
2204 if (!gimple_clobber_p (stmt)
2205 && !is_gimple_debug (stmt))
2207 opt_result res
2208 = vect_analyze_stmt (loop_vinfo,
2209 loop_vinfo->lookup_stmt (stmt),
2210 &need_to_vectorize,
2211 NULL, NULL, &cost_vec);
2212 if (!res)
2213 return res;
2216 } /* bbs */
2218 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2220 /* All operations in the loop are either irrelevant (deal with loop
2221 control, or dead), or only used outside the loop and can be moved
2222 out of the loop (e.g. invariants, inductions). The loop can be
2223 optimized away by scalar optimizations. We're better off not
2224 touching this loop. */
2225 if (!need_to_vectorize)
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_NOTE, vect_location,
2229 "All the computation can be taken out of the loop.\n");
2230 return opt_result::failure_at
2231 (vect_location,
2232 "not vectorized: redundant loop. no profit to vectorize.\n");
2235 return opt_result::success ();
2238 /* Return true if we know that the iteration count is smaller than the
2239 vectorization factor. Return false if it isn't, or if we can't be sure
2240 either way. */
2242 static bool
2243 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2245 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2247 HOST_WIDE_INT max_niter;
2248 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2249 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2250 else
2251 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2253 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2254 return true;
2256 return false;
2259 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2260 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2261 definitely no, or -1 if it's worth retrying. */
2263 static int
2264 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2265 unsigned *suggested_unroll_factor)
2267 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2268 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2270 /* Only loops that can handle partially-populated vectors can have iteration
2271 counts less than the vectorization factor. */
2272 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2273 && vect_known_niters_smaller_than_vf (loop_vinfo))
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277 "not vectorized: iteration count smaller than "
2278 "vectorization factor.\n");
2279 return 0;
2282 /* If we know the number of iterations we can do better: for the
2283 epilogue we can also decide whether the main loop leaves us
2284 with enough iterations, preferring a smaller vector epilogue that
2285 is then also possibly used for the case where we skip the vector loop.
2286 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2288 widest_int scalar_niters
2289 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2290 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2292 loop_vec_info orig_loop_vinfo
2293 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2294 unsigned lowest_vf
2295 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2296 int prolog_peeling = 0;
2297 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2298 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2299 if (prolog_peeling >= 0
2300 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2301 lowest_vf))
2303 unsigned gap
2304 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2305 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2306 % lowest_vf + gap);
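/* Illustrative example: with 100 scalar iterations, a main-loop VF of 8,
   3 iterations of prologue peeling and no peeling for gaps, the epilogue
   is left with (100 - 0 - 3) % 8 + 0 = 1 scalar iteration.  */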
2309 /* Reject vectorizing for a single scalar iteration, even if
2310 we could in principle implement that using partial vectors. */
2311 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2312 if (scalar_niters <= peeling_gap + 1)
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 "not vectorized: loop only has a single "
2317 "scalar iteration.\n");
2318 return 0;
2321 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2323 /* Check that the loop processes at least one full vector. */
2324 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2325 if (known_lt (scalar_niters, vf))
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329 "loop does not have enough iterations "
2330 "to support vectorization.\n");
2331 return 0;
2334 /* If we need to peel an extra epilogue iteration to handle data
2335 accesses with gaps, check that there are enough scalar iterations
2336 available.
2338 The check above is redundant with this one when peeling for gaps,
2339 but the distinction is useful for diagnostics. */
2340 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2341 && known_le (scalar_niters, vf))
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 "loop does not have enough iterations "
2346 "to support peeling for gaps.\n");
2347 return 0;
2352 /* If using the "very cheap" model, reject cases in which we'd keep
2353 a copy of the scalar code (even if we might be able to vectorize it). */
2354 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2355 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2356 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361 "some scalar iterations would need to be peeled\n");
2362 return 0;
2365 int min_profitable_iters, min_profitable_estimate;
2366 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2367 &min_profitable_estimate,
2368 suggested_unroll_factor);
2370 if (min_profitable_iters < 0)
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374 "not vectorized: vectorization not profitable.\n");
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377 "not vectorized: vector version will never be "
2378 "profitable.\n");
2379 return -1;
2382 int min_scalar_loop_bound = (param_min_vect_loop_bound
2383 * assumed_vf);
2385 /* Use the cost model only if it is more conservative than user specified
2386 threshold. */
2387 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2388 min_profitable_iters);
2390 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
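/* Illustrative example: with an assumed VF of 4, --param
   min-vect-loop-bound set to 2 and min_profitable_iters of 11,
   the threshold is MAX (2 * 4, 11) = 11.  */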
2392 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2393 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397 "not vectorized: vectorization not profitable.\n");
2398 if (dump_enabled_p ())
2399 dump_printf_loc (MSG_NOTE, vect_location,
2400 "not vectorized: iteration count smaller than user "
2401 "specified loop bound parameter or minimum profitable "
2402 "iterations (whichever is more conservative).\n");
2403 return 0;
2406 /* The static profitability threshold min_profitable_estimate includes
2407 the cost of having to check at runtime whether the scalar loop
2408 should be used instead. If it turns out that we don't need or want
2409 such a check, the threshold we should use for the static estimate
2410 is simply the point at which the vector loop becomes more profitable
2411 than the scalar loop. */
2412 if (min_profitable_estimate > min_profitable_iters
2413 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2414 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2415 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2416 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2420 " choice between the scalar and vector loops\n");
2421 min_profitable_estimate = min_profitable_iters;
2424 /* If the vector loop needs multiple iterations to be beneficial then
2425 things are probably too close to call, and the conservative thing
2426 would be to stick with the scalar code. */
2427 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2428 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "one iteration of the vector loop would be"
2433 " more expensive than the equivalent number of"
2434 " iterations of the scalar loop\n");
2435 return 0;
2438 HOST_WIDE_INT estimated_niter;
2440 /* If we are vectorizing an epilogue then we know the maximum number of
2441 scalar iterations it will cover is at least one lower than the
2442 vectorization factor of the main loop. */
2443 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2444 estimated_niter
2445 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2446 else
2448 estimated_niter = estimated_stmt_executions_int (loop);
2449 if (estimated_niter == -1)
2450 estimated_niter = likely_max_stmt_executions_int (loop);
2452 if (estimated_niter != -1
2453 && ((unsigned HOST_WIDE_INT) estimated_niter
2454 < MAX (th, (unsigned) min_profitable_estimate)))
2456 if (dump_enabled_p ())
2457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458 "not vectorized: estimated iteration count too "
2459 "small.\n");
2460 if (dump_enabled_p ())
2461 dump_printf_loc (MSG_NOTE, vect_location,
2462 "not vectorized: estimated iteration count smaller "
2463 "than specified loop bound parameter or minimum "
2464 "profitable iterations (whichever is more "
2465 "conservative).\n");
2466 return -1;
2469 return 1;
2472 static opt_result
2473 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2474 vec<data_reference_p> *datarefs,
2475 unsigned int *n_stmts)
2477 *n_stmts = 0;
2478 for (unsigned i = 0; i < loop->num_nodes; i++)
2479 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2480 !gsi_end_p (gsi); gsi_next (&gsi))
2482 gimple *stmt = gsi_stmt (gsi);
2483 if (is_gimple_debug (stmt))
2484 continue;
2485 ++(*n_stmts);
2486 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2487 NULL, 0);
2488 if (!res)
2490 if (is_gimple_call (stmt) && loop->safelen)
2492 tree fndecl = gimple_call_fndecl (stmt), op;
2493 if (fndecl == NULL_TREE
2494 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2496 fndecl = gimple_call_arg (stmt, 0);
2497 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2498 fndecl = TREE_OPERAND (fndecl, 0);
2499 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2501 if (fndecl != NULL_TREE)
2503 cgraph_node *node = cgraph_node::get (fndecl);
2504 if (node != NULL && node->simd_clones != NULL)
2506 unsigned int j, n = gimple_call_num_args (stmt);
2507 for (j = 0; j < n; j++)
2509 op = gimple_call_arg (stmt, j);
2510 if (DECL_P (op)
2511 || (REFERENCE_CLASS_P (op)
2512 && get_base_address (op)))
2513 break;
2515 op = gimple_call_lhs (stmt);
2516 /* Ignore #pragma omp declare simd functions
2517 if they don't have data references in the
2518 call stmt itself. */
2519 if (j == n
2520 && !(op
2521 && (DECL_P (op)
2522 || (REFERENCE_CLASS_P (op)
2523 && get_base_address (op)))))
2524 continue;
2528 return res;
2530 /* If dependence analysis will give up due to the limit on the
2531 number of datarefs, stop here and fail fatally.
2532 if (datarefs->length ()
2533 > (unsigned)param_loop_max_datarefs_for_datadeps)
2534 return opt_result::failure_at (stmt, "exceeded param "
2535 "loop-max-datarefs-for-datadeps\n");
2537 return opt_result::success ();
2540 /* Look for SLP-only access groups and turn each individual access into its own
2541 group. */
2542 static void
2543 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2545 unsigned int i;
2546 struct data_reference *dr;
2548 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2550 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2551 FOR_EACH_VEC_ELT (datarefs, i, dr)
2553 gcc_assert (DR_REF (dr));
2554 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2556 /* Check if the load is a part of an interleaving chain. */
2557 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2559 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2560 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2561 unsigned int group_size = DR_GROUP_SIZE (first_element);
2563 /* Check if this is an SLP-only group. */
2564 if (!STMT_SLP_TYPE (stmt_info)
2565 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2567 /* Dissolve the group. */
2568 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2570 stmt_vec_info vinfo = first_element;
2571 while (vinfo)
2573 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2574 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2575 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2576 DR_GROUP_SIZE (vinfo) = 1;
2577 if (STMT_VINFO_STRIDED_P (first_element)
2578 /* We cannot handle stores with gaps. */
2579 || DR_IS_WRITE (dr_info->dr))
2581 STMT_VINFO_STRIDED_P (vinfo) = true;
2582 DR_GROUP_GAP (vinfo) = 0;
2584 else
2585 DR_GROUP_GAP (vinfo) = group_size - 1;
2586 /* Duplicate and adjust alignment info; it needs to
2587 be present on each group leader, see dr_misalignment. */
2588 if (vinfo != first_element)
2590 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2591 dr_info2->target_alignment = dr_info->target_alignment;
2592 int misalignment = dr_info->misalignment;
2593 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2595 HOST_WIDE_INT diff
2596 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2597 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2598 unsigned HOST_WIDE_INT align_c
2599 = dr_info->target_alignment.to_constant ();
2600 misalignment = (misalignment + diff) % align_c;
2602 dr_info2->misalignment = misalignment;
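/* Illustrative example: if the group leader has misalignment 4 against
   a target alignment of 16 and this access starts 8 bytes later
   (diff = 8), the duplicated misalignment becomes (4 + 8) % 16 = 12.  */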
2604 vinfo = next;
2611 /* Determine if operating on full vectors for LOOP_VINFO might leave
2612 some scalar iterations still to do. If so, decide how we should
2613 handle those scalar iterations. The possibilities are:
2615 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2616 In this case:
2618 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2619 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2620 LOOP_VINFO_PEELING_FOR_NITER == false
2622 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2623 to handle the remaining scalar iterations. In this case:
2625 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2626 LOOP_VINFO_PEELING_FOR_NITER == true
2628 There are two choices:
2630 (2a) Consider vectorizing the epilogue loop at the same VF as the
2631 main loop, but using partial vectors instead of full vectors.
2632 In this case:
2634 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2636 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2637 In this case:
2639 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
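   As an illustrative example: with VF = 8 and 1003 scalar iterations,
   option (1) runs 126 vector iterations, the last of which operates on a
   partially-populated vector with only 3 active lanes, whereas option (2)
   runs 125 full-vector iterations and leaves 3 scalar iterations to an
   epilogue handled as in (2a) or (2b).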
2642 opt_result
2643 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2645 /* Determine whether there would be any scalar iterations left over. */
2646 bool need_peeling_or_partial_vectors_p
2647 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2649 /* Decide whether to vectorize the loop with partial vectors. */
2650 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2651 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2652 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2653 && need_peeling_or_partial_vectors_p)
2655 /* For partial-vector-usage=1, try to push the handling of partial
2656 vectors to the epilogue, with the main loop continuing to operate
2657 on full vectors.
2659 If we are unrolling we also do not want to use partial vectors. This
2660 is to avoid the overhead of generating multiple masks and also to
2661 avoid having to execute entire iterations of FALSE masked instructions
2662 when dealing with one or fewer full iterations.
2664 ??? We could then end up failing to use partial vectors if we
2665 decide to peel iterations into a prologue, and if the main loop
2666 then ends up processing fewer than VF iterations. */
2667 if ((param_vect_partial_vector_usage == 1
2668 || loop_vinfo->suggested_unroll_factor > 1)
2669 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2670 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2671 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2672 else
2673 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2676 if (dump_enabled_p ())
2677 dump_printf_loc (MSG_NOTE, vect_location,
2678 "operating on %s vectors%s.\n",
2679 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2680 ? "partial" : "full",
2681 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2682 ? " for epilogue loop" : "");
2684 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2685 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2686 && need_peeling_or_partial_vectors_p);
2688 /* LOOP_VINFO_USING_SELECT_VL_P is set to true before loop vectorization
2689 analysis, when we don't yet know whether the loop will be vectorized
2690 with partial vectors (for more details see tree-vect-loop-manip.cc).
2692 However, the SELECT_VL vectorization style should only be applied to
2693 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2694 the number of elements to be processed in each iteration.
2696 After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2697 if the loop is not vectorized with partial vectors. */
2698 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2699 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2701 return opt_result::success ();
2704 /* Function vect_analyze_loop_2.
2706 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2707 analyses will record information in some members of LOOP_VINFO. FATAL
2708 indicates whether some analysis hit a fatal error. If a non-NULL pointer
2709 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with a
2710 worked-out suggested unroll factor, while a NULL pointer means we are
2711 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2712 holds the SLP decision made when the suggested unroll factor was worked
2713 out. */
2714 static opt_result
2715 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2716 unsigned *suggested_unroll_factor,
2717 bool& slp_done_for_suggested_uf)
2719 opt_result ok = opt_result::success ();
2720 int res;
2721 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2722 poly_uint64 min_vf = 2;
2723 loop_vec_info orig_loop_vinfo = NULL;
2725 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2726 loop_vec_info of the first vectorized loop. */
2727 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2728 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2729 else
2730 orig_loop_vinfo = loop_vinfo;
2731 gcc_assert (orig_loop_vinfo);
2733 /* The first group of checks is independent of the vector size. */
2734 fatal = true;
2736 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2737 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2738 return opt_result::failure_at (vect_location,
2739 "not vectorized: simd if(0)\n");
2741 /* Find all data references in the loop (which correspond to vdefs/vuses)
2742 and analyze their evolution in the loop. */
2744 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2746 /* Gather the data references and count stmts in the loop. */
2747 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2749 opt_result res
2750 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2751 &LOOP_VINFO_DATAREFS (loop_vinfo),
2752 &LOOP_VINFO_N_STMTS (loop_vinfo));
2753 if (!res)
2755 if (dump_enabled_p ())
2756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2757 "not vectorized: loop contains function "
2758 "calls or data references that cannot "
2759 "be analyzed\n");
2760 return res;
2762 loop_vinfo->shared->save_datarefs ();
2764 else
2765 loop_vinfo->shared->check_datarefs ();
2767 /* Analyze the data references and also adjust the minimal
2768 vectorization factor according to the loads and stores. */
2770 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2771 if (!ok)
2773 if (dump_enabled_p ())
2774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2775 "bad data references.\n");
2776 return ok;
2779 /* Check if we are applying unroll factor now. */
2780 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2781 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2783 /* If the SLP decision was false when the suggested unroll factor was
2784 worked out, and we are now applying that suggested unroll factor, we
2785 can simply skip all SLP-related analyses this time.
2786 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2788 /* Classify all cross-iteration scalar data-flow cycles.
2789 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2790 vect_analyze_scalar_cycles (loop_vinfo, slp);
2792 vect_pattern_recog (loop_vinfo);
2794 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2796 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2797 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2799 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2800 if (!ok)
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2804 "bad data access.\n");
2805 return ok;
2808 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2810 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2811 if (!ok)
2813 if (dump_enabled_p ())
2814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2815 "unexpected pattern.\n");
2816 return ok;
2819 /* The rest of the analysis below depends on the vector size in some way,
2819 so failures from here on are not treated as fatal. */
2820 fatal = false;
2822 /* Analyze data dependences between the data-refs in the loop
2823 and adjust the maximum vectorization factor according to
2824 the dependences.
2825 FORNOW: fail at the first data dependence that we encounter. */
2827 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2828 if (!ok)
2830 if (dump_enabled_p ())
2831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2832 "bad data dependence.\n");
2833 return ok;
2835 if (max_vf != MAX_VECTORIZATION_FACTOR
2836 && maybe_lt (max_vf, min_vf))
2837 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2838 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2840 ok = vect_determine_vectorization_factor (loop_vinfo);
2841 if (!ok)
2843 if (dump_enabled_p ())
2844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2845 "can't determine vectorization factor.\n");
2846 return ok;
2849 /* Compute the scalar iteration cost. */
2850 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2852 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2854 if (slp)
2856 /* Check the SLP opportunities in the loop, analyze and build
2857 SLP trees. */
2858 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2859 if (!ok)
2860 return ok;
2862 /* If there are any SLP instances mark them as pure_slp. */
2863 slp = vect_make_slp_decision (loop_vinfo);
2864 if (slp)
2866 /* Find stmts that need to be both vectorized and SLPed. */
2867 vect_detect_hybrid_slp (loop_vinfo);
2869 /* Update the vectorization factor based on the SLP decision. */
2870 vect_update_vf_for_slp (loop_vinfo);
2872 /* Optimize the SLP graph with the vectorization factor fixed. */
2873 vect_optimize_slp (loop_vinfo);
2875 /* Gather the loads reachable from the SLP graph entries. */
2876 vect_gather_slp_loads (loop_vinfo);
2880 bool saved_can_use_partial_vectors_p
2881 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2883 /* We don't expect to have to roll back to anything other than an empty
2884 set of rgroups. */
2885 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2887 /* This is the point where we can re-start analysis with SLP forced off. */
2888 start_over:
2890 /* Apply the suggested unrolling factor; this was determined by the backend
2891 during finish_cost the first time we ran the analysis for this
2892 vector mode. */
2893 if (applying_suggested_uf)
2894 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2896 /* Now the vectorization factor is final. */
2897 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2898 gcc_assert (known_ne (vectorization_factor, 0U));
2900 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2902 dump_printf_loc (MSG_NOTE, vect_location,
2903 "vectorization_factor = ");
2904 dump_dec (MSG_NOTE, vectorization_factor);
2905 dump_printf (MSG_NOTE, ", niters = %wd\n",
2906 LOOP_VINFO_INT_NITERS (loop_vinfo));
2909 if (max_vf != MAX_VECTORIZATION_FACTOR
2910 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2911 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2913 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2915 /* Analyze the alignment of the data-refs in the loop.
2916 Fail if a data reference is found that cannot be vectorized. */
2918 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2919 if (!ok)
2921 if (dump_enabled_p ())
2922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923 "bad data alignment.\n");
2924 return ok;
2927 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2928 It is important to call pruning after vect_analyze_data_ref_accesses,
2929 since we use grouping information gathered by interleaving analysis. */
2930 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2931 if (!ok)
2932 return ok;
2934 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2935 vectorization, since we do not want to add extra peeling or
2936 add versioning for alignment. */
2937 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2938 /* This pass will decide on using loop versioning and/or loop peeling in
2939 order to enhance the alignment of data references in the loop. */
2940 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2941 if (!ok)
2942 return ok;
2944 if (slp)
2946 /* Analyze operations in the SLP instances. Note this may
2947 remove unsupported SLP instances which makes the above
2948 SLP kind detection invalid. */
2949 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2950 vect_slp_analyze_operations (loop_vinfo);
2951 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2953 ok = opt_result::failure_at (vect_location,
2954 "unsupported SLP instances\n");
2955 goto again;
2958 /* Check whether any load in ALL SLP instances is possibly permuted. */
2959 slp_tree load_node, slp_root;
2960 unsigned i, x;
2961 slp_instance instance;
2962 bool can_use_lanes = true;
2963 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2965 slp_root = SLP_INSTANCE_TREE (instance);
2966 int group_size = SLP_TREE_LANES (slp_root);
2967 tree vectype = SLP_TREE_VECTYPE (slp_root);
2968 bool loads_permuted = false;
2969 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2971 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2972 continue;
2973 unsigned j;
2974 stmt_vec_info load_info;
2975 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2976 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2978 loads_permuted = true;
2979 break;
2983 /* If the loads and stores can be handled with load/store-lane
2984 instructions record it and move on to the next instance. */
2985 if (loads_permuted
2986 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2987 && vect_store_lanes_supported (vectype, group_size, false)
2988 != IFN_LAST)
2990 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2991 if (STMT_VINFO_GROUPED_ACCESS
2992 (SLP_TREE_REPRESENTATIVE (load_node)))
2994 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2995 (SLP_TREE_REPRESENTATIVE (load_node));
2996 /* Use SLP for strided accesses (or if we can't
2997 use load-lanes). */
2998 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2999 || vect_load_lanes_supported
3000 (STMT_VINFO_VECTYPE (stmt_vinfo),
3001 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3002 break;
3005 can_use_lanes
3006 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3008 if (can_use_lanes && dump_enabled_p ())
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 "SLP instance %p can use load/store-lanes\n",
3011 (void *) instance);
3013 else
3015 can_use_lanes = false;
3016 break;
3020 /* If all SLP instances can use load/store-lanes abort SLP and try again
3021 with SLP disabled. */
3022 if (can_use_lanes)
3024 ok = opt_result::failure_at (vect_location,
3025 "Built SLP cancelled: can use "
3026 "load/store-lanes\n");
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "Built SLP cancelled: all SLP instances support "
3030 "load/store-lanes\n");
3031 goto again;
3035 /* Dissolve SLP-only groups. */
3036 vect_dissolve_slp_only_groups (loop_vinfo);
3038 /* Scan all the remaining operations in the loop that are not subject
3039 to SLP and make sure they are vectorizable. */
3040 ok = vect_analyze_loop_operations (loop_vinfo);
3041 if (!ok)
3043 if (dump_enabled_p ())
3044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3045 "bad operation or unsupported loop bound.\n");
3046 return ok;
3049 /* For now, we don't expect to mix both masking and length approaches for one
3050 loop; disable partial vectors if both are recorded. */
3051 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3052 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3053 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3057 "can't vectorize a loop with partial vectors"
3058 " because we don't expect to mix different"
3059 " approaches with partial vectors for the"
3060 " same loop.\n");
3061 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3064 /* If we still have the option of using partial vectors,
3065 check whether we can generate the necessary loop controls. */
3066 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3068 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3070 if (!vect_verify_full_masking (loop_vinfo)
3071 && !vect_verify_full_masking_avx512 (loop_vinfo))
3072 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3074 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3075 if (!vect_verify_loop_lens (loop_vinfo))
3076 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3079 /* If we're vectorizing a loop that uses length "controls" and
3080 can iterate more than once, we apply the decrementing IV approach
3081 in loop control. */
3082 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3083 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3084 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3085 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3086 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3087 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3088 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3090 /* If a loop uses length controls and has a decrementing loop control IV,
3091 we will normally pass that IV through a MIN_EXPR to calculate the
3092 basis for the length controls. E.g. in a loop that processes one
3093 element per scalar iteration, the number of elements would be
3094 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3096 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3097 step, since only the final iteration of the vector loop can have
3098 inactive lanes.
3100 However, some targets have a dedicated instruction for calculating the
3101 preferred length, given the total number of elements that still need to
3102 be processed. This is encapsulated in the SELECT_VL internal function.
3104 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3105 to determine the basis for the length controls. However, unlike the
3106 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3107 lanes inactive in any iteration of the vector loop, not just the last
3108 iteration. This SELECT_VL approach therefore requires us to use pointer
3109 IVs with variable steps.
3111 Once we've decided how many elements should be processed by one
3112 iteration of the vector loop, we need to populate the rgroup controls.
3113 If a loop has multiple rgroups, we need to make sure that those rgroups
3114 "line up" (that is, they must be consistent about which elements are
3115 active and which aren't). This is done by vect_adjust_loop_lens_control.
3117 In principle, it would be possible to use vect_adjust_loop_lens_control
3118 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3119 However:
3121 (1) In practice, it only makes sense to use SELECT_VL when a vector
3122 operation will be controlled directly by the result. It is not
3123 worth using SELECT_VL if it would only be the input to other
3124 calculations.
3126 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3127 pointer IV will need N updates by a variable amount (N-1 updates
3128 within the iteration and 1 update to move to the next iteration).
3130 Because of this, we prefer to use the MIN_EXPR approach whenever there
3131 is more than one length control.
3133 In addition, SELECT_VL always operates to a granularity of 1 unit.
3134 If we wanted to use it to control an SLP operation on N consecutive
3135 elements, we would need to make the SELECT_VL inputs measure scalar
3136 iterations (rather than elements) and then multiply the SELECT_VL
3137 result by N. But using SELECT_VL this way is inefficient because
3138 of (1) above.
3140 Furthermore, we don't apply SELECT_VL on a single rgroup when both (1)
3141 and (2) below are satisfied:
3143 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3144 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3146 Since SELECT_VL (with its variable step) would make SCEV analysis fail and
3147 we would then lose the benefit of subsequent unroll optimizations, we prefer
3148 using the MIN_EXPR approach in this situation. */
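/* A rough sketch of the decrementing-IV length calculation (illustrative
   pseudo code, for a loop processing one element per scalar iteration):

     remain = n;
     do
       {
         len = MIN (remain, VF);    (or: len = SELECT_VL (remain, VF))
         ... operate on LEN active lanes ...
         remain -= len;
       }
     while (remain > 0);

   With MIN_EXPR only the final iteration can have LEN < VF, so pointer
   IVs can keep an invariant step; with SELECT_VL any iteration may
   choose LEN < VF, so pointer IVs need a variable step of LEN.  */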
3149 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3151 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3152 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3153 OPTIMIZE_FOR_SPEED)
3154 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3155 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3156 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3157 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3158 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3161 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3162 assuming that the loop will be used as a main loop. We will redo
3163 this analysis later if we instead decide to use the loop as an
3164 epilogue loop. */
3165 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3166 if (!ok)
3167 return ok;
3169 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3170 to be able to handle fewer than VF scalars, or needs to have a lower VF
3171 than the main loop. */
3172 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3173 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175 poly_uint64 unscaled_vf
3176 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3177 orig_loop_vinfo->suggested_unroll_factor);
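/* Illustrative example: if the main loop has VF 16 obtained by unrolling
   a VF-8 choice by a suggested factor of 2, unscaled_vf is 8 and an
   epilogue loop is only accepted here with a VF strictly below 8.  */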
3178 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3179 return opt_result::failure_at (vect_location,
3180 "Vectorization factor too high for"
3181 " epilogue loop.\n");
3184 /* Check the costings of the loop make vectorizing worthwhile. */
3185 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3186 if (res < 0)
3188 ok = opt_result::failure_at (vect_location,
3189 "Loop costings may not be worthwhile.\n");
3190 goto again;
3192 if (!res)
3193 return opt_result::failure_at (vect_location,
3194 "Loop costings not worthwhile.\n");
3196 /* If an epilogue loop is required make sure we can create one. */
3197 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3198 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3199 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3201 if (dump_enabled_p ())
3202 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3203 if (!vect_can_advance_ivs_p (loop_vinfo)
3204 || !slpeel_can_duplicate_loop_p (loop,
3205 LOOP_VINFO_IV_EXIT (loop_vinfo),
3206 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3208 ok = opt_result::failure_at (vect_location,
3209 "not vectorized: can't create required "
3210 "epilog loop\n");
3211 goto again;
3215 /* During peeling, we need to check if number of loop iterations is
3216 enough for both peeled prolog loop and vector loop. This check
3217 can be merged along with threshold check of loop versioning, so
3218 increase threshold for this case if necessary.
3220 If we are analyzing an epilogue we still want to check what its
3221 versioning threshold would be. If we decide to vectorize the epilogues we
3222 will want to use the lowest versioning threshold of all epilogues and main
3223 loop. This will enable us to enter a vectorized epilogue even when
3224 versioning the loop. We can't simply check whether the epilogue requires
3225 versioning though since we may have skipped some versioning checks when
3226 analyzing the epilogue. For instance, checks for alias versioning will be
3227 skipped when dealing with epilogues as we assume we already checked them
3228 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3229 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3231 poly_uint64 niters_th = 0;
3232 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3236 /* Niters for peeled prolog loop. */
3237 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3239 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3240 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3241 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3243 else
3244 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3247 /* Niters for at least one iteration of vectorized loop. */
3248 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3249 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3250 /* One additional iteration because of peeling for gap. */
3251 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3252 niters_th += 1;
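/* Illustrative example: with 3 prologue iterations peeled for alignment,
   full vectors with VF 8 and peeling for gaps, niters_th is
   3 + 8 + 1 = 12 at this point, before being combined with the
   cost-model threshold below.  */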
3254 /* Use the same condition as vect_transform_loop to decide when to use
3255 the cost to determine a versioning threshold. */
3256 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3257 && ordered_p (th, niters_th))
3258 niters_th = ordered_max (poly_uint64 (th), niters_th);
3260 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3263 gcc_assert (known_eq (vectorization_factor,
3264 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3266 slp_done_for_suggested_uf = slp;
3268 /* Ok to vectorize! */
3269 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3270 return opt_result::success ();
3272 again:
3273 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3274 gcc_assert (!ok);
3276 /* Try again with SLP forced off but if we didn't do any SLP there is
3277 no point in re-trying. */
3278 if (!slp)
3279 return ok;
3281 /* If the SLP decision was true when the suggested unroll factor was
3282 worked out, and we are now applying that suggested unroll factor, we
3283 don't need to re-try any more.
3284 if (applying_suggested_uf && slp_done_for_suggested_uf)
3285 return ok;
3287 /* If there are reduction chains re-trying will fail anyway. */
3288 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3289 return ok;
3291 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3292 via interleaving or lane instructions. */
3293 slp_instance instance;
3294 slp_tree node;
3295 unsigned i, j;
3296 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3298 stmt_vec_info vinfo;
3299 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3300 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3301 continue;
3302 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3303 unsigned int size = DR_GROUP_SIZE (vinfo);
3304 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3305 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3306 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3307 && ! vect_grouped_store_supported (vectype, size))
3308 return opt_result::failure_at (vinfo->stmt,
3309 "unsupported grouped store\n");
3310 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3312 vinfo = SLP_TREE_REPRESENTATIVE (node);
3313 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3315 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3316 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3317 size = DR_GROUP_SIZE (vinfo);
3318 vectype = STMT_VINFO_VECTYPE (vinfo);
3319 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3320 && ! vect_grouped_load_supported (vectype, single_element_p,
3321 size))
3322 return opt_result::failure_at (vinfo->stmt,
3323 "unsupported grouped load\n");
3328 if (dump_enabled_p ())
3329 dump_printf_loc (MSG_NOTE, vect_location,
3330 "re-trying with SLP disabled\n");
3332 /* Roll back state appropriately. No SLP this time. */
3333 slp = false;
3334 /* Restore vectorization factor as it were without SLP. */
3335 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3336 /* Free the SLP instances. */
3337 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3338 vect_free_slp_instance (instance);
3339 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3340 /* Reset SLP type to loop_vect on all stmts. */
3341 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3343 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3344 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3345 !gsi_end_p (si); gsi_next (&si))
3347 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3348 STMT_SLP_TYPE (stmt_info) = loop_vect;
3349 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3350 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3352 /* vectorizable_reduction adjusts reduction stmt def-types,
3353 restore them to that of the PHI. */
3354 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3355 = STMT_VINFO_DEF_TYPE (stmt_info);
3356 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3357 (STMT_VINFO_REDUC_DEF (stmt_info)))
3358 = STMT_VINFO_DEF_TYPE (stmt_info);
3361 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3362 !gsi_end_p (si); gsi_next (&si))
3364 if (is_gimple_debug (gsi_stmt (si)))
3365 continue;
3366 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3367 STMT_SLP_TYPE (stmt_info) = loop_vect;
3368 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3370 stmt_vec_info pattern_stmt_info
3371 = STMT_VINFO_RELATED_STMT (stmt_info);
3372 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3373 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3375 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3376 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3377 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3378 !gsi_end_p (pi); gsi_next (&pi))
3379 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3380 = loop_vect;
3384 /* Free optimized alias test DDRS. */
3385 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3386 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3387 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3388 /* Reset target cost data. */
3389 delete loop_vinfo->vector_costs;
3390 loop_vinfo->vector_costs = nullptr;
3391 /* Reset accumulated rgroup information. */
3392 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3393 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3394 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3395 /* Reset assorted flags. */
3396 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3397 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3398 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3399 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3400 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3401 = saved_can_use_partial_vectors_p;
3403 goto start_over;
3406 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3407 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3408 OLD_LOOP_VINFO is better unless something specifically indicates
3409 otherwise.
3411 Note that this deliberately isn't a partial order. */
3413 static bool
3414 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3415 loop_vec_info old_loop_vinfo)
3417 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3418 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3420 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3421 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3423 /* Always prefer a VF of loop->simdlen over any other VF. */
3424 if (loop->simdlen)
3426 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3427 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3428 if (new_simdlen_p != old_simdlen_p)
3429 return new_simdlen_p;
3432 const auto *old_costs = old_loop_vinfo->vector_costs;
3433 const auto *new_costs = new_loop_vinfo->vector_costs;
3434 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3435 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3437 return new_costs->better_main_loop_than_p (old_costs);
3440 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3441 true if we should. */
3443 static bool
3444 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3445 loop_vec_info old_loop_vinfo)
3447 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3448 return false;
3450 if (dump_enabled_p ())
3451 dump_printf_loc (MSG_NOTE, vect_location,
3452 "***** Preferring vector mode %s to vector mode %s\n",
3453 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3454 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3455 return true;
3458 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3459 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3460 MODE_I to the next mode useful to analyze.
3461 Return the loop_vinfo on success and wrapped null on failure. */
3463 static opt_loop_vec_info
3464 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3465 const vect_loop_form_info *loop_form_info,
3466 loop_vec_info main_loop_vinfo,
3467 const vector_modes &vector_modes, unsigned &mode_i,
3468 machine_mode &autodetected_vector_mode,
3469 bool &fatal)
3471 loop_vec_info loop_vinfo
3472 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3474 machine_mode vector_mode = vector_modes[mode_i];
3475 loop_vinfo->vector_mode = vector_mode;
3476 unsigned int suggested_unroll_factor = 1;
3477 bool slp_done_for_suggested_uf = false;
3479 /* Run the main analysis. */
3480 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3481 &suggested_unroll_factor,
3482 slp_done_for_suggested_uf);
3483 if (dump_enabled_p ())
3484 dump_printf_loc (MSG_NOTE, vect_location,
3485 "***** Analysis %s with vector mode %s\n",
3486 res ? "succeeded" : "failed",
3487 GET_MODE_NAME (loop_vinfo->vector_mode));
3489 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3491 if (dump_enabled_p ())
3492 dump_printf_loc (MSG_NOTE, vect_location,
3493 "***** Re-trying analysis for unrolling"
3494 " with unroll factor %d and slp %s.\n",
3495 suggested_unroll_factor,
3496 slp_done_for_suggested_uf ? "on" : "off");
3497 loop_vec_info unroll_vinfo
3498 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3499 unroll_vinfo->vector_mode = vector_mode;
3500 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3501 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3502 slp_done_for_suggested_uf);
3503 if (new_res)
3505 delete loop_vinfo;
3506 loop_vinfo = unroll_vinfo;
3508 else
3509 delete unroll_vinfo;
3512 /* Remember the autodetected vector mode. */
3513 if (vector_mode == VOIDmode)
3514 autodetected_vector_mode = loop_vinfo->vector_mode;
3516 /* Advance mode_i, first skipping modes that would yield the
3517 same analysis result. */
3518 while (mode_i + 1 < vector_modes.length ()
3519 && vect_chooses_same_modes_p (loop_vinfo,
3520 vector_modes[mode_i + 1]))
3522 if (dump_enabled_p ())
3523 dump_printf_loc (MSG_NOTE, vect_location,
3524 "***** The result for vector mode %s would"
3525 " be the same\n",
3526 GET_MODE_NAME (vector_modes[mode_i + 1]));
3527 mode_i += 1;
3529 if (mode_i + 1 < vector_modes.length ()
3530 && VECTOR_MODE_P (autodetected_vector_mode)
3531 && (related_vector_mode (vector_modes[mode_i + 1],
3532 GET_MODE_INNER (autodetected_vector_mode))
3533 == autodetected_vector_mode)
3534 && (related_vector_mode (autodetected_vector_mode,
3535 GET_MODE_INNER (vector_modes[mode_i + 1]))
3536 == vector_modes[mode_i + 1]))
3538 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_NOTE, vect_location,
3540 "***** Skipping vector mode %s, which would"
3541 " repeat the analysis for %s\n",
3542 GET_MODE_NAME (vector_modes[mode_i + 1]),
3543 GET_MODE_NAME (autodetected_vector_mode));
3544 mode_i += 1;
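/* An illustrative example of the check above (hypothetical modes, for
   exposition only): if V16QImode was the autodetected mode and the next
   entry in VECTOR_MODES is V4SImode, the two related_vector_mode queries
   map each mode to the other, so both describe the same 128-bit vectors
   and re-analyzing with V4SImode would merely repeat the V16QImode
   analysis.  */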
3546 mode_i++;
3548 if (!res)
3550 delete loop_vinfo;
3551 if (fatal)
3552 gcc_checking_assert (main_loop_vinfo == NULL);
3553 return opt_loop_vec_info::propagate_failure (res);
3556 return opt_loop_vec_info::success (loop_vinfo);
3559 /* Function vect_analyze_loop.
3561 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3562 for it. The different analyses will record information in the
3563 loop_vec_info struct. */
3564 opt_loop_vec_info
3565 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3567 DUMP_VECT_SCOPE ("analyze_loop_nest");
3569 if (loop_outer (loop)
3570 && loop_vec_info_for_loop (loop_outer (loop))
3571 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3572 return opt_loop_vec_info::failure_at (vect_location,
3573 "outer-loop already vectorized.\n");
3575 if (!find_loop_nest (loop, &shared->loop_nest))
3576 return opt_loop_vec_info::failure_at
3577 (vect_location,
3578 "not vectorized: loop nest containing two or more consecutive inner"
3579 " loops cannot be vectorized\n");
3581 /* Analyze the loop form. */
3582 vect_loop_form_info loop_form_info;
3583 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3584 if (!res)
3586 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3588 "bad loop form.\n");
3589 return opt_loop_vec_info::propagate_failure (res);
3591 if (!integer_onep (loop_form_info.assumptions))
3593 /* We consider vectorizing this loop by versioning it under
3594 some assumptions. In order to do this, we need to clear
3595 existing information computed by scev and niter analyzer. */
3596 scev_reset_htab ();
3597 free_numbers_of_iterations_estimates (loop);
3598 /* Also set a flag for this loop so that the following scev and niter
3599 analyses are done under the assumptions. */
3600 loop_constraint_set (loop, LOOP_C_FINITE);
3602 else
3603 /* Clear the existing niter information to make sure the nonwrapping flag
3604 will be calculated and set properly. */
3605 free_numbers_of_iterations_estimates (loop);
3607 auto_vector_modes vector_modes;
3608 /* Autodetect first vector size we try. */
3609 vector_modes.safe_push (VOIDmode);
3610 unsigned int autovec_flags
3611 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3612 loop->simdlen != 0);
3613 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3614 && !unlimited_cost_model (loop));
3615 machine_mode autodetected_vector_mode = VOIDmode;
3616 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3617 unsigned int mode_i = 0;
3618 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3620 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3621 a mode has not been analyzed. */
3622 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3623 for (unsigned i = 0; i < vector_modes.length (); ++i)
3624 cached_vf_per_mode.safe_push (0);
3626 /* First determine the main loop vectorization mode, either the first
3627 one that works, starting with auto-detecting the vector mode and then
3628 following the target's order of preference, or the one with the
3629 lowest cost if pick_lowest_cost_p. */
3630 while (1)
3632 bool fatal;
3633 unsigned int last_mode_i = mode_i;
3634 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3635 failed. */
3636 cached_vf_per_mode[last_mode_i] = -1;
3637 opt_loop_vec_info loop_vinfo
3638 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3639 NULL, vector_modes, mode_i,
3640 autodetected_vector_mode, fatal);
3641 if (fatal)
3642 break;
3644 if (loop_vinfo)
3646 /* Analysis has been successful, so update the VF value. The
3647 VF should always be a multiple of unroll_factor and we want to
3648 capture the original VF here. */
3649 cached_vf_per_mode[last_mode_i]
3650 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3651 loop_vinfo->suggested_unroll_factor);
3652 /* Once we hit the desired simdlen for the first time,
3653 discard any previous attempts. */
3654 if (simdlen
3655 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3657 delete first_loop_vinfo;
3658 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3659 simdlen = 0;
3661 else if (pick_lowest_cost_p
3662 && first_loop_vinfo
3663 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3665 /* Pick loop_vinfo over first_loop_vinfo. */
3666 delete first_loop_vinfo;
3667 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3669 if (first_loop_vinfo == NULL)
3670 first_loop_vinfo = loop_vinfo;
3671 else
3673 delete loop_vinfo;
3674 loop_vinfo = opt_loop_vec_info::success (NULL);
3677 /* Commit to first_loop_vinfo if we have no reason to try
3678 alternatives. */
3679 if (!simdlen && !pick_lowest_cost_p)
3680 break;
3682 if (mode_i == vector_modes.length ()
3683 || autodetected_vector_mode == VOIDmode)
3684 break;
3686 /* Try the next biggest vector size. */
3687 if (dump_enabled_p ())
3688 dump_printf_loc (MSG_NOTE, vect_location,
3689 "***** Re-trying analysis with vector mode %s\n",
3690 GET_MODE_NAME (vector_modes[mode_i]));
3692 if (!first_loop_vinfo)
3693 return opt_loop_vec_info::propagate_failure (res);
3695 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_NOTE, vect_location,
3697 "***** Choosing vector mode %s\n",
3698 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3700 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3701 enabled, SIMDUID is not set, it is the innermost loop and we have
3702 either already found the loop's SIMDLEN or there was no SIMDLEN to
3703 begin with.
3704 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3705 bool vect_epilogues = (!simdlen
3706 && loop->inner == NULL
3707 && param_vect_epilogues_nomask
3708 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3709 /* No code motion support for multiple epilogues, so for now this is
3710 not supported when the loop has multiple exits. */
3711 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3712 && !loop->simduid);
3713 if (!vect_epilogues)
3714 return first_loop_vinfo;
3716 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3717 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3719 /* For epilogues start the analysis from the first mode. The motivation
3720 behind starting from the beginning comes from cases where the VECTOR_MODES
3721 array may contain length-agnostic and length-specific modes. Their
3722 ordering is not guaranteed, so we could end up picking a mode for the main
3723 loop that is after the epilogue's optimal mode. */
3724 vector_modes[0] = autodetected_vector_mode;
3725 mode_i = 0;
3727 bool supports_partial_vectors =
3728 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3729 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3731 while (1)
3733 /* If the target does not support partial vectors we can shorten the
3734 number of modes to analyze for the epilogue as we know we can't pick a
3735 mode that would lead to a VF at least as big as the
3736 FIRST_VINFO_VF. */
3737 if (!supports_partial_vectors
3738 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3740 mode_i++;
3741 if (mode_i == vector_modes.length ())
3742 break;
3743 continue;
3746 if (dump_enabled_p ())
3747 dump_printf_loc (MSG_NOTE, vect_location,
3748 "***** Re-trying epilogue analysis with vector "
3749 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3751 bool fatal;
3752 opt_loop_vec_info loop_vinfo
3753 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3754 first_loop_vinfo,
3755 vector_modes, mode_i,
3756 autodetected_vector_mode, fatal);
3757 if (fatal)
3758 break;
3760 if (loop_vinfo)
3762 if (pick_lowest_cost_p)
3764 /* Keep trying to roll back vectorization attempts while the
3765 loop_vec_infos they produced were worse than this one. */
3766 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3767 while (!vinfos.is_empty ()
3768 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3770 gcc_assert (vect_epilogues);
3771 delete vinfos.pop ();
3774 /* For now only allow one epilogue loop. */
3775 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3777 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3778 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3779 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3780 || maybe_ne (lowest_th, 0U));
3781 /* Keep track of the known smallest versioning
3782 threshold. */
3783 if (ordered_p (lowest_th, th))
3784 lowest_th = ordered_min (lowest_th, th);
3786 else
3788 delete loop_vinfo;
3789 loop_vinfo = opt_loop_vec_info::success (NULL);
3792 /* For now only allow one epilogue loop, but allow
3793 pick_lowest_cost_p to replace it, so commit to the
3794 first epilogue if we have no reason to try alternatives. */
3795 if (!pick_lowest_cost_p)
3796 break;
3799 if (mode_i == vector_modes.length ())
3800 break;
3804 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3806 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3807 if (dump_enabled_p ())
3808 dump_printf_loc (MSG_NOTE, vect_location,
3809 "***** Choosing epilogue vector mode %s\n",
3810 GET_MODE_NAME
3811 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3814 return first_loop_vinfo;
3817 /* Return true if there is an in-order reduction function for CODE, storing
3818 it in *REDUC_FN if so. */
3820 static bool
3821 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3823 /* We support MINUS_EXPR by negating the operand. This also preserves an
3824 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3825 (-0.0) = -0.0. */
3826 if (code == PLUS_EXPR || code == MINUS_EXPR)
3828 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3829 return true;
3831 return false;
3834 /* Function reduction_fn_for_scalar_code
3836 Input:
3837 CODE - tree_code of a reduction operation.
3839 Output:
3840 REDUC_FN - the corresponding internal function to be used to reduce the
3841 vector of partial results into a single scalar result, or IFN_LAST
3842 if the operation is a supported reduction operation, but does not have
3843 such an internal function.
3845 Return FALSE if CODE currently cannot be vectorized as a reduction. */
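/* As a rough illustration of the semantics: IFN_REDUC_PLUS reduces a whole
   vector of partial results to their scalar sum, e.g. conceptually
   REDUC_PLUS ({1, 2, 3, 4}) == 10; the other IFN_REDUC_* functions behave
   analogously for their operation.  */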
3847 bool
3848 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3850 if (code.is_tree_code ())
3851 switch (tree_code (code))
3853 case MAX_EXPR:
3854 *reduc_fn = IFN_REDUC_MAX;
3855 return true;
3857 case MIN_EXPR:
3858 *reduc_fn = IFN_REDUC_MIN;
3859 return true;
3861 case PLUS_EXPR:
3862 *reduc_fn = IFN_REDUC_PLUS;
3863 return true;
3865 case BIT_AND_EXPR:
3866 *reduc_fn = IFN_REDUC_AND;
3867 return true;
3869 case BIT_IOR_EXPR:
3870 *reduc_fn = IFN_REDUC_IOR;
3871 return true;
3873 case BIT_XOR_EXPR:
3874 *reduc_fn = IFN_REDUC_XOR;
3875 return true;
3877 case MULT_EXPR:
3878 case MINUS_EXPR:
3879 *reduc_fn = IFN_LAST;
3880 return true;
3882 default:
3883 return false;
3885 else
3886 switch (combined_fn (code))
3888 CASE_CFN_FMAX:
3889 *reduc_fn = IFN_REDUC_FMAX;
3890 return true;
3892 CASE_CFN_FMIN:
3893 *reduc_fn = IFN_REDUC_FMIN;
3894 return true;
3896 default:
3897 return false;
3901 /* If there is a neutral value X such that a reduction would not be affected
3902 by the introduction of additional X elements, return that X, otherwise
3903 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3904 of the scalar elements. If the reduction has just a single initial value
3905 then INITIAL_VALUE is that value, otherwise it is null.
3906 If AS_INITIAL is TRUE the value is supposed to be used as the initial value.
3907 In that case no signed zero is returned. */
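/* As an illustration: for a sum reduction such as
     for (i = 0; i < n; i++) s += a[i];
   padding the final vector with zeros does not change the result, so 0 is
   the neutral value for PLUS_EXPR; similarly 1 for MULT_EXPR and all-ones
   for BIT_AND_EXPR, while MIN_EXPR/MAX_EXPR have no neutral value other
   than the single initial value itself.  */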
3909 tree
3910 neutral_op_for_reduction (tree scalar_type, code_helper code,
3911 tree initial_value, bool as_initial)
3913 if (code.is_tree_code ())
3914 switch (tree_code (code))
3916 case DOT_PROD_EXPR:
3917 case SAD_EXPR:
3918 case MINUS_EXPR:
3919 case BIT_IOR_EXPR:
3920 case BIT_XOR_EXPR:
3921 return build_zero_cst (scalar_type);
3922 case WIDEN_SUM_EXPR:
3923 case PLUS_EXPR:
3924 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3925 return build_real (scalar_type, dconstm0);
3926 else
3927 return build_zero_cst (scalar_type);
3929 case MULT_EXPR:
3930 return build_one_cst (scalar_type);
3932 case BIT_AND_EXPR:
3933 return build_all_ones_cst (scalar_type);
3935 case MAX_EXPR:
3936 case MIN_EXPR:
3937 return initial_value;
3939 default:
3940 return NULL_TREE;
3942 else
3943 switch (combined_fn (code))
3945 CASE_CFN_FMIN:
3946 CASE_CFN_FMAX:
3947 return initial_value;
3949 default:
3950 return NULL_TREE;
3954 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3955 STMT is printed with a message MSG. */
3957 static void
3958 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3960 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3963 /* Return true if we need an in-order reduction for operation CODE
3964 on type TYPE, i.e. one that must preserve the original evaluation
3965 order because the operation cannot safely be reassociated. */
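/* Illustrative example: in double precision (0.1 + 0.2) + 0.3 evaluates
   to 0.6000000000000001 whereas 0.1 + (0.2 + 0.3) evaluates to 0.6, so
   without -fassociative-math a floating-point sum like s += a[i] has to
   be reduced in the original (fold-left) order.  */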
3967 bool
3968 needs_fold_left_reduction_p (tree type, code_helper code)
3970 /* CHECKME: check for !flag_finite_math_only too? */
3971 if (SCALAR_FLOAT_TYPE_P (type))
3973 if (code.is_tree_code ())
3974 switch (tree_code (code))
3976 case MIN_EXPR:
3977 case MAX_EXPR:
3978 return false;
3980 default:
3981 return !flag_associative_math;
3983 else
3984 switch (combined_fn (code))
3986 CASE_CFN_FMIN:
3987 CASE_CFN_FMAX:
3988 return false;
3990 default:
3991 return !flag_associative_math;
3995 if (INTEGRAL_TYPE_P (type))
3996 return (!code.is_tree_code ()
3997 || !operation_no_trapping_overflow (type, tree_code (code)));
3999 if (SAT_FIXED_POINT_TYPE_P (type))
4000 return true;
4002 return false;
4005 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
4006 has a handled computation expression. Store the main reduction
4007 operation in *CODE. */
4009 static bool
4010 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4011 tree loop_arg, code_helper *code,
4012 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4014 auto_bitmap visited;
4015 tree lookfor = PHI_RESULT (phi);
4016 ssa_op_iter curri;
4017 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4018 while (USE_FROM_PTR (curr) != loop_arg)
4019 curr = op_iter_next_use (&curri);
4020 curri.i = curri.numops;
4023 path.safe_push (std::make_pair (curri, curr));
4024 tree use = USE_FROM_PTR (curr);
4025 if (use == lookfor)
4026 break;
4027 gimple *def = SSA_NAME_DEF_STMT (use);
4028 if (gimple_nop_p (def)
4029 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4031 pop:
4034 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4035 curri = x.first;
4036 curr = x.second;
4038 curr = op_iter_next_use (&curri);
4039 /* Skip already visited or non-SSA operands (from iterating
4040 over PHI args). */
4041 while (curr != NULL_USE_OPERAND_P
4042 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4043 || ! bitmap_set_bit (visited,
4044 SSA_NAME_VERSION
4045 (USE_FROM_PTR (curr)))));
4047 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4048 if (curr == NULL_USE_OPERAND_P)
4049 break;
4051 else
4053 if (gimple_code (def) == GIMPLE_PHI)
4054 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4055 else
4056 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4057 while (curr != NULL_USE_OPERAND_P
4058 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4059 || ! bitmap_set_bit (visited,
4060 SSA_NAME_VERSION
4061 (USE_FROM_PTR (curr)))))
4062 curr = op_iter_next_use (&curri);
4063 if (curr == NULL_USE_OPERAND_P)
4064 goto pop;
4067 while (1);
4068 if (dump_file && (dump_flags & TDF_DETAILS))
4070 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4071 unsigned i;
4072 std::pair<ssa_op_iter, use_operand_p> *x;
4073 FOR_EACH_VEC_ELT (path, i, x)
4074 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4075 dump_printf (MSG_NOTE, "\n");
4078 /* Check whether the reduction path detected is valid. */
4079 bool fail = path.length () == 0;
4080 bool neg = false;
4081 int sign = -1;
4082 *code = ERROR_MARK;
4083 for (unsigned i = 1; i < path.length (); ++i)
4085 gimple *use_stmt = USE_STMT (path[i].second);
4086 gimple_match_op op;
4087 if (!gimple_extract_op (use_stmt, &op))
4089 fail = true;
4090 break;
4092 unsigned int opi = op.num_ops;
4093 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4095 /* The following makes sure we can compute the operand index
4096 easily, and it mostly disallows chaining via COND_EXPR condition
4097 operands. */
4098 for (opi = 0; opi < op.num_ops; ++opi)
4099 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4100 break;
4102 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4104 for (opi = 0; opi < op.num_ops; ++opi)
4105 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4106 break;
4108 if (opi == op.num_ops)
4110 fail = true;
4111 break;
4113 op.code = canonicalize_code (op.code, op.type);
4114 if (op.code == MINUS_EXPR)
4116 op.code = PLUS_EXPR;
4117 /* Track whether we negate the reduction value each iteration. */
4118 if (op.ops[1] == op.ops[opi])
4119 neg = ! neg;
4121 else if (op.code == IFN_COND_SUB)
4123 op.code = IFN_COND_ADD;
4124 /* Track whether we negate the reduction value each iteration. */
4125 if (op.ops[2] == op.ops[opi])
4126 neg = ! neg;
4128 if (CONVERT_EXPR_CODE_P (op.code)
4129 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4131 else if (*code == ERROR_MARK)
4133 *code = op.code;
4134 sign = TYPE_SIGN (op.type);
4136 else if (op.code != *code)
4138 fail = true;
4139 break;
4141 else if ((op.code == MIN_EXPR
4142 || op.code == MAX_EXPR)
4143 && sign != TYPE_SIGN (op.type))
4145 fail = true;
4146 break;
4148 /* Check there's only a single stmt the op is used on. For the
4149 non-value-changing tail and the last stmt, allow out-of-loop uses.
4150 ??? We could relax this and handle arbitrary live stmts by
4151 forcing a scalar epilogue for example. */
4152 imm_use_iterator imm_iter;
4153 use_operand_p use_p;
4154 gimple *op_use_stmt;
4155 unsigned cnt = 0;
4156 bool cond_fn_p = op.code.is_internal_fn ()
4157 && (conditional_internal_fn_code (internal_fn (op.code))
4158 != ERROR_MARK);
4160 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4162 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4163 op1 twice (once as definition, once as else) in the same operation.
4164 Allow this. */
4165 if (cond_fn_p && op_use_stmt == use_stmt)
4167 gcall *call = as_a<gcall *> (use_stmt);
4168 unsigned else_pos
4169 = internal_fn_else_index (internal_fn (op.code));
4171 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4173 if (j == else_pos)
4174 continue;
4175 if (gimple_call_arg (call, j) == op.ops[opi])
4176 cnt++;
4179 else if (!is_gimple_debug (op_use_stmt)
4180 && (*code != ERROR_MARK
4181 || flow_bb_inside_loop_p (loop,
4182 gimple_bb (op_use_stmt))))
4183 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4184 cnt++;
4187 if (cnt != 1)
4189 fail = true;
4190 break;
4193 return ! fail && ! neg && *code != ERROR_MARK;
4196 bool
4197 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4198 tree loop_arg, enum tree_code code)
4200 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4201 code_helper code_;
4202 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4203 && code_ == code);
4208 /* Function vect_is_simple_reduction
4210 (1) Detect a cross-iteration def-use cycle that represents a simple
4211 reduction computation. We look for the following pattern:
4213 loop_header:
4214 a1 = phi < a0, a2 >
4215 a3 = ...
4216 a2 = operation (a3, a1)
4220 a3 = ...
4221 loop_header:
4222 a1 = phi < a0, a2 >
4223 a2 = operation (a3, a1)
4225 such that:
4226 1. operation is commutative and associative and it is safe to
4227 change the order of the computation
4228 2. no uses for a2 in the loop (a2 is used out of the loop)
4229 3. no uses of a1 in the loop besides the reduction operation
4230 4. no uses of a1 outside the loop.
4232 Conditions 1,4 are tested here.
4233 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4235 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4236 nested cycles.
4238 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4239 reductions:
4241 a1 = phi < a0, a2 >
4242 inner loop (def of a3)
4243 a2 = phi < a3 >
4245 (4) Detect condition expressions, i.e.:
4246 for (int i = 0; i < N; i++)
4247 if (a[i] < val)
4248 ret_val = a[i];
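As a rough source-level illustration of pattern (1):

for (i = 0; i < n; i++)
s = s + a[i];

here a1 is the loop-header PHI of s, a3 is the loaded value a[i] and
a2 is the updated sum, which is used only by the PHI and after the loop.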
4252 static stmt_vec_info
4253 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4254 bool *double_reduc, bool *reduc_chain_p, bool slp)
4256 gphi *phi = as_a <gphi *> (phi_info->stmt);
4257 gimple *phi_use_stmt = NULL;
4258 imm_use_iterator imm_iter;
4259 use_operand_p use_p;
4261 *double_reduc = false;
4262 *reduc_chain_p = false;
4263 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4265 tree phi_name = PHI_RESULT (phi);
4266 /* ??? If there are no uses of the PHI result the inner loop reduction
4267 won't be detected as possibly double-reduction by vectorizable_reduction
4268 because that tries to walk the PHI arg from the preheader edge which
4269 can be constant. See PR60382. */
4270 if (has_zero_uses (phi_name))
4271 return NULL;
4272 class loop *loop = (gimple_bb (phi))->loop_father;
4273 unsigned nphi_def_loop_uses = 0;
4274 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4276 gimple *use_stmt = USE_STMT (use_p);
4277 if (is_gimple_debug (use_stmt))
4278 continue;
4280 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4282 if (dump_enabled_p ())
4283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4284 "intermediate value used outside loop.\n");
4286 return NULL;
4289 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4290 op1 twice (once as definition, once as else) in the same operation.
4291 Only count it as one. */
4292 if (use_stmt != phi_use_stmt)
4294 nphi_def_loop_uses++;
4295 phi_use_stmt = use_stmt;
4299 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4300 if (TREE_CODE (latch_def) != SSA_NAME)
4302 if (dump_enabled_p ())
4303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4304 "reduction: not ssa_name: %T\n", latch_def);
4305 return NULL;
4308 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4309 if (!def_stmt_info
4310 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4311 return NULL;
4313 bool nested_in_vect_loop
4314 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4315 unsigned nlatch_def_loop_uses = 0;
4316 auto_vec<gphi *, 3> lcphis;
4317 bool inner_loop_of_double_reduc = false;
4318 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4320 gimple *use_stmt = USE_STMT (use_p);
4321 if (is_gimple_debug (use_stmt))
4322 continue;
4323 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4324 nlatch_def_loop_uses++;
4325 else
4327 /* We can have more than one loop-closed PHI. */
4328 lcphis.safe_push (as_a <gphi *> (use_stmt));
4329 if (nested_in_vect_loop
4330 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4331 == vect_double_reduction_def))
4332 inner_loop_of_double_reduc = true;
4336 /* If we are vectorizing an inner reduction, we execute it in the
4337 original order only when we are not dealing with a double
4338 reduction. */
4339 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4341 if (dump_enabled_p ())
4342 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4343 "detected nested cycle: ");
4344 return def_stmt_info;
4347 /* When the inner loop of a double reduction ends up with more than
4348 one loop-closed PHI we have failed to classify alternate such
4349 PHIs as double reduction, leading to wrong code. See PR103237. */
4350 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4352 if (dump_enabled_p ())
4353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4354 "unhandled double reduction\n");
4355 return NULL;
4358 /* If this isn't a nested cycle or if the nested cycle reduction value
4359 is used outside of the inner loop we cannot handle uses of the reduction
4360 value. */
4361 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4363 if (dump_enabled_p ())
4364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4365 "reduction used in loop.\n");
4366 return NULL;
4369 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4370 defined in the inner loop. */
4371 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4373 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4374 if (gimple_phi_num_args (def_stmt) != 1
4375 || TREE_CODE (op1) != SSA_NAME)
4377 if (dump_enabled_p ())
4378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4379 "unsupported phi node definition.\n");
4381 return NULL;
4384 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4385 and the latch definition op1. */
4386 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4387 if (gimple_bb (def1)
4388 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4389 && loop->inner
4390 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4391 && (is_gimple_assign (def1) || is_gimple_call (def1))
4392 && is_a <gphi *> (phi_use_stmt)
4393 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4394 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4395 loop_latch_edge (loop->inner))))
4397 if (dump_enabled_p ())
4398 report_vect_op (MSG_NOTE, def_stmt,
4399 "detected double reduction: ");
4401 *double_reduc = true;
4402 return def_stmt_info;
4405 return NULL;
4408 /* Look for the expression computing latch_def from the loop PHI result. */
4409 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4410 code_helper code;
4411 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4412 path))
4414 STMT_VINFO_REDUC_CODE (phi_info) = code;
4415 if (code == COND_EXPR && !nested_in_vect_loop)
4416 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4418 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4419 reduction chain for which the additional restriction is that
4420 all operations in the chain are the same. */
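/* For illustration, a reduction chain typically comes from a body like
     s = s + a[2*i];
     s = s + a[2*i + 1];
   i.e. several statements with the same operation feeding a single
   accumulator within one iteration.  */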
4421 auto_vec<stmt_vec_info, 8> reduc_chain;
4422 unsigned i;
4423 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4424 for (i = path.length () - 1; i >= 1; --i)
4426 gimple *stmt = USE_STMT (path[i].second);
4427 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4428 gimple_match_op op;
4429 if (!gimple_extract_op (stmt, &op))
4430 gcc_unreachable ();
4431 if (gassign *assign = dyn_cast<gassign *> (stmt))
4432 STMT_VINFO_REDUC_IDX (stmt_info)
4433 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4434 else
4436 gcall *call = as_a<gcall *> (stmt);
4437 STMT_VINFO_REDUC_IDX (stmt_info)
4438 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4440 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4441 && (i == 1 || i == path.length () - 1));
4442 if ((op.code != code && !leading_conversion)
4443 /* We can only handle the final value in epilogue
4444 generation for reduction chains. */
4445 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4446 is_slp_reduc = false;
4447 /* For reduction chains we support trailing/leading
4448 conversions. We do not store those in the actual chain. */
4449 if (leading_conversion)
4450 continue;
4451 reduc_chain.safe_push (stmt_info);
4453 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4455 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4457 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4458 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4460 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4461 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4463 /* Save the chain for further analysis in SLP detection. */
4464 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4465 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4467 *reduc_chain_p = true;
4468 if (dump_enabled_p ())
4469 dump_printf_loc (MSG_NOTE, vect_location,
4470 "reduction: detected reduction chain\n");
4472 else if (dump_enabled_p ())
4473 dump_printf_loc (MSG_NOTE, vect_location,
4474 "reduction: detected reduction\n");
4476 return def_stmt_info;
4479 if (dump_enabled_p ())
4480 dump_printf_loc (MSG_NOTE, vect_location,
4481 "reduction: unknown pattern\n");
4483 return NULL;
4486 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4487 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4488 or -1 if not known. */
4490 static int
4491 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4493 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4494 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4496 if (dump_enabled_p ())
4497 dump_printf_loc (MSG_NOTE, vect_location,
4498 "cost model: epilogue peel iters set to vf/2 "
4499 "because loop iterations are unknown.\n");
4500 return assumed_vf / 2;
4502 else
4504 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4505 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4506 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
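/* For example (hypothetical numbers): with niters == 103, assumed_vf == 8
   and peel_iters_prologue == 3 the epilogue handles (103 - 3) % 8 == 4
   iterations.  */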
4507 /* If we need to peel for gaps, but no epilogue peeling would otherwise
4508 be required, we have to peel VF iterations. */
4509 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4510 peel_iters_epilogue = assumed_vf;
4511 return peel_iters_epilogue;
4515 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4516 int
4517 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4518 int *peel_iters_epilogue,
4519 stmt_vector_for_cost *scalar_cost_vec,
4520 stmt_vector_for_cost *prologue_cost_vec,
4521 stmt_vector_for_cost *epilogue_cost_vec)
4523 int retval = 0;
4525 *peel_iters_epilogue
4526 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4528 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4530 /* If peeled iterations are known but the number of scalar loop
4531 iterations is unknown, count a taken branch per peeled loop. */
4532 if (peel_iters_prologue > 0)
4533 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4534 vect_prologue);
4535 if (*peel_iters_epilogue > 0)
4536 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4537 vect_epilogue);
4540 stmt_info_for_cost *si;
4541 int j;
4542 if (peel_iters_prologue)
4543 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4544 retval += record_stmt_cost (prologue_cost_vec,
4545 si->count * peel_iters_prologue,
4546 si->kind, si->stmt_info, si->misalign,
4547 vect_prologue);
4548 if (*peel_iters_epilogue)
4549 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4550 retval += record_stmt_cost (epilogue_cost_vec,
4551 si->count * *peel_iters_epilogue,
4552 si->kind, si->stmt_info, si->misalign,
4553 vect_epilogue);
4555 return retval;
4558 /* Function vect_estimate_min_profitable_iters
4560 Return the number of iterations required for the vector version of the
4561 loop to be profitable relative to the cost of the scalar version of the
4562 loop.
4564 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4565 of iterations for vectorization. A value of -1 means loop vectorization
4566 is not profitable. This returned value may be used for dynamic
4567 profitability check.
4569 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4570 for static check against estimated number of iterations. */
4572 static void
4573 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4574 int *ret_min_profitable_niters,
4575 int *ret_min_profitable_estimate,
4576 unsigned *suggested_unroll_factor)
4578 int min_profitable_iters;
4579 int min_profitable_estimate;
4580 int peel_iters_prologue;
4581 int peel_iters_epilogue;
4582 unsigned vec_inside_cost = 0;
4583 int vec_outside_cost = 0;
4584 unsigned vec_prologue_cost = 0;
4585 unsigned vec_epilogue_cost = 0;
4586 int scalar_single_iter_cost = 0;
4587 int scalar_outside_cost = 0;
4588 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4589 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4590 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4592 /* Cost model disabled. */
4593 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4595 if (dump_enabled_p ())
4596 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4597 *ret_min_profitable_niters = 0;
4598 *ret_min_profitable_estimate = 0;
4599 return;
4602 /* Requires loop versioning tests to handle misalignment. */
4603 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4605 /* FIXME: Make cost depend on complexity of individual check. */
4606 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4607 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4608 if (dump_enabled_p ())
4609 dump_printf (MSG_NOTE,
4610 "cost model: Adding cost of checks for loop "
4611 "versioning to treat misalignment.\n");
4614 /* Requires loop versioning with alias checks. */
4615 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4617 /* FIXME: Make cost depend on complexity of individual check. */
4618 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4619 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4620 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4621 if (len)
4622 /* Count LEN - 1 ANDs and LEN comparisons. */
4623 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4624 scalar_stmt, vect_prologue);
4625 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4626 if (len)
4628 /* Count LEN - 1 ANDs and LEN comparisons. */
4629 unsigned int nstmts = len * 2 - 1;
4630 /* +1 for each bias that needs adding. */
4631 for (unsigned int i = 0; i < len; ++i)
4632 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4633 nstmts += 1;
4634 (void) add_stmt_cost (target_cost_data, nstmts,
4635 scalar_stmt, vect_prologue);
4637 if (dump_enabled_p ())
4638 dump_printf (MSG_NOTE,
4639 "cost model: Adding cost of checks for loop "
4640 "versioning aliasing.\n");
4643 /* Requires loop versioning with niter checks. */
4644 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4646 /* FIXME: Make cost depend on complexity of individual check. */
4647 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4648 NULL, NULL, NULL_TREE, 0, vect_prologue);
4649 if (dump_enabled_p ())
4650 dump_printf (MSG_NOTE,
4651 "cost model: Adding cost of checks for loop "
4652 "versioning niters.\n");
4655 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4656 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4657 vect_prologue);
4659 /* Count statements in scalar loop. Using this as scalar cost for a single
4660 iteration for now.
4662 TODO: Add outer loop support.
4664 TODO: Consider assigning different costs to different scalar
4665 statements. */
4667 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4669 /* Add additional cost for the peeled instructions in prologue and epilogue
4670 loop. (For fully-masked loops there will be no peeling.)
4672 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4673 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4675 TODO: Build an expression that represents peel_iters for prologue and
4676 epilogue to be used in a run-time test. */
4678 bool prologue_need_br_taken_cost = false;
4679 bool prologue_need_br_not_taken_cost = false;
4681 /* Calculate peel_iters_prologue. */
4682 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4683 peel_iters_prologue = 0;
4684 else if (npeel < 0)
4686 peel_iters_prologue = assumed_vf / 2;
4687 if (dump_enabled_p ())
4688 dump_printf (MSG_NOTE, "cost model: "
4689 "prologue peel iters set to vf/2.\n");
4691 /* If peeled iterations are unknown, count a taken branch and a not taken
4692 branch per peeled loop. Even if scalar loop iterations are known,
4693 vector iterations are not known since peeled prologue iterations are
4694 not known. Hence guards remain the same. */
4695 prologue_need_br_taken_cost = true;
4696 prologue_need_br_not_taken_cost = true;
4698 else
4700 peel_iters_prologue = npeel;
4701 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4702 /* If peeled iterations are known but the number of scalar loop
4703 iterations is unknown, count a taken branch per peeled loop. */
4704 prologue_need_br_taken_cost = true;
4707 bool epilogue_need_br_taken_cost = false;
4708 bool epilogue_need_br_not_taken_cost = false;
4710 /* Calculate peel_iters_epilogue. */
4711 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4712 /* We need to peel exactly one iteration for gaps. */
4713 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4714 else if (npeel < 0)
4716 /* If peeling for alignment is unknown, the loop bound of the main
4717 loop becomes unknown. */
4718 peel_iters_epilogue = assumed_vf / 2;
4719 if (dump_enabled_p ())
4720 dump_printf (MSG_NOTE, "cost model: "
4721 "epilogue peel iters set to vf/2 because "
4722 "peeling for alignment is unknown.\n");
4724 /* See the same reason above in peel_iters_prologue calculation. */
4725 epilogue_need_br_taken_cost = true;
4726 epilogue_need_br_not_taken_cost = true;
4728 else
4730 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4731 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4732 /* If peeled iterations are known but the number of scalar loop
4733 iterations is unknown, count a taken branch per peeled loop. */
4734 epilogue_need_br_taken_cost = true;
4737 stmt_info_for_cost *si;
4738 int j;
4739 /* Add costs associated with peel_iters_prologue. */
4740 if (peel_iters_prologue)
4741 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4743 (void) add_stmt_cost (target_cost_data,
4744 si->count * peel_iters_prologue, si->kind,
4745 si->stmt_info, si->node, si->vectype,
4746 si->misalign, vect_prologue);
4749 /* Add costs associated with peel_iters_epilogue. */
4750 if (peel_iters_epilogue)
4751 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4753 (void) add_stmt_cost (target_cost_data,
4754 si->count * peel_iters_epilogue, si->kind,
4755 si->stmt_info, si->node, si->vectype,
4756 si->misalign, vect_epilogue);
4759 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4761 if (prologue_need_br_taken_cost)
4762 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4763 vect_prologue);
4765 if (prologue_need_br_not_taken_cost)
4766 (void) add_stmt_cost (target_cost_data, 1,
4767 cond_branch_not_taken, vect_prologue);
4769 if (epilogue_need_br_taken_cost)
4770 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4771 vect_epilogue);
4773 if (epilogue_need_br_not_taken_cost)
4774 (void) add_stmt_cost (target_cost_data, 1,
4775 cond_branch_not_taken, vect_epilogue);
4777 /* Take care of special costs for rgroup controls of partial vectors. */
4778 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4779 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4780 == vect_partial_vectors_avx512))
4782 /* Calculate how many masks we need to generate. */
4783 unsigned int num_masks = 0;
4784 bool need_saturation = false;
4785 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4786 if (rgm.type)
4788 unsigned nvectors = rgm.factor;
4789 num_masks += nvectors;
4790 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4791 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4792 need_saturation = true;
4795 /* ??? The target isn't able to identify the costs below as
4796 producing masks so it cannot penalize cases where we'd run
4797 out of mask registers for example. */
4799 /* ??? We are also failing to account for smaller vector masks
4800 we generate by splitting larger masks in vect_get_loop_mask. */
4802 /* In the worst case, we need to generate each mask in the prologue
4803 and in the loop body. We need one splat per group and one
4804 compare per mask.
4806 Sometimes the prologue mask will fold to a constant,
4807 so the actual prologue cost might be smaller. However, it's
4808 simpler and safer to use the worst-case cost; if this ends up
4809 being the tie-breaker between vectorizing or not, then it's
4810 probably better not to vectorize. */
4811 (void) add_stmt_cost (target_cost_data,
4812 num_masks
4813 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4814 vector_stmt, NULL, NULL, NULL_TREE, 0,
4815 vect_prologue);
4816 (void) add_stmt_cost (target_cost_data,
4817 num_masks
4818 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4819 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4821 /* When we need saturation we need it both in the prologue and
4822 the epilogue. */
4823 if (need_saturation)
4825 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4826 NULL, NULL, NULL_TREE, 0, vect_prologue);
4827 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4828 NULL, NULL, NULL_TREE, 0, vect_body);
4831 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4832 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4833 == vect_partial_vectors_while_ult))
4835 /* Calculate how many masks we need to generate. */
4836 unsigned int num_masks = 0;
4837 rgroup_controls *rgm;
4838 unsigned int num_vectors_m1;
4839 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4840 num_vectors_m1, rgm)
4841 if (rgm->type)
4842 num_masks += num_vectors_m1 + 1;
4843 gcc_assert (num_masks > 0);
4845 /* In the worst case, we need to generate each mask in the prologue
4846 and in the loop body. One of the loop body mask instructions
4847 replaces the comparison in the scalar loop, and since we don't
4848 count the scalar comparison against the scalar body, we shouldn't
4849 count that vector instruction against the vector body either.
4851 Sometimes we can use unpacks instead of generating prologue
4852 masks and sometimes the prologue mask will fold to a constant,
4853 so the actual prologue cost might be smaller. However, it's
4854 simpler and safer to use the worst-case cost; if this ends up
4855 being the tie-breaker between vectorizing or not, then it's
4856 probably better not to vectorize. */
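/* A small illustration (hypothetical rgroups): with one rgroup needing
   two masks and another needing one, num_masks == 3, so three mask
   statements are costed in the prologue and two in the body.  */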
4857 (void) add_stmt_cost (target_cost_data, num_masks,
4858 vector_stmt, NULL, NULL, NULL_TREE, 0,
4859 vect_prologue);
4860 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4861 vector_stmt, NULL, NULL, NULL_TREE, 0,
4862 vect_body);
4864 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4866 /* Referring to the functions vect_set_loop_condition_partial_vectors
4867 and vect_set_loop_controls_directly, we need to generate each
4868 length in the prologue and in the loop body if required. Although
4869 there are some possible optimizations, we consider the worst case
4870 here. */
4872 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4873 signed char partial_load_store_bias
4874 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4875 bool need_iterate_p
4876 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4877 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4879 /* Calculate how many statements to be added. */
4880 unsigned int prologue_stmts = 0;
4881 unsigned int body_stmts = 0;
4883 rgroup_controls *rgc;
4884 unsigned int num_vectors_m1;
4885 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4886 if (rgc->type)
4888 /* May need one SHIFT for nitems_total computation. */
4889 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4890 if (nitems != 1 && !niters_known_p)
4891 prologue_stmts += 1;
4893 /* May need one MAX and one MINUS for wrap around. */
4894 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4895 prologue_stmts += 2;
4897 /* Need one MAX and one MINUS for each batch limit except for
4898 the first one. */
4899 prologue_stmts += num_vectors_m1 * 2;
4901 unsigned int num_vectors = num_vectors_m1 + 1;
4903 /* Need to set up lengths in the prologue; only one MIN is required
4904 for each since the start index is zero. */
4905 prologue_stmts += num_vectors;
4907 /* If we have a non-zero partial load bias, we need one PLUS
4908 to adjust the load length. */
4909 if (partial_load_store_bias != 0)
4910 body_stmts += 1;
4912 unsigned int length_update_cost = 0;
4913 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4914 /* For the decrement IV style, each length only needs a single
4915 SELECT_VL or MIN at the beginning to calculate the number of
4916 elements to be processed in the current iteration. */
4917 length_update_cost = 1;
4918 else
4919 /* For the increment IV style, each length may need two MINs and one
4920 MINUS to update the lengths in the body for the next iteration. */
4921 length_update_cost = 3;
4923 if (need_iterate_p)
4924 body_stmts += length_update_cost * num_vectors;
4927 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4928 scalar_stmt, vect_prologue);
4929 (void) add_stmt_cost (target_cost_data, body_stmts,
4930 scalar_stmt, vect_body);
4933 /* FORNOW: The scalar outside cost is incremented in one of the
4934 following ways:
4936 1. The vectorizer checks for alignment and aliasing and generates
4937 a condition that allows dynamic vectorization. A cost model
4938 check is ANDED with the versioning condition. Hence scalar code
4939 path now has the added cost of the versioning check.
4941 if (cost > th & versioning_check)
4942 jmp to vector code
4944 Hence run-time scalar is incremented by not-taken branch cost.
4946 2. The vectorizer then checks if a prologue is required. If the
4947 cost model check was not done before during versioning, it has to
4948 be done before the prologue check.
4950 if (cost <= th)
4951 prologue = scalar_iters
4952 if (prologue == 0)
4953 jmp to vector code
4954 else
4955 execute prologue
4956 if (prologue == num_iters)
4957 go to exit
4959 Hence the run-time scalar cost is incremented by a taken branch,
4960 plus a not-taken branch, plus a taken branch cost.
4962 3. The vectorizer then checks if an epilogue is required. If the
4963 cost model check was not done before during prologue check, it
4964 has to be done with the epilogue check.
4966 if (prologue == 0)
4967 jmp to vector code
4968 else
4969 execute prologue
4970 if (prologue == num_iters)
4971 go to exit
4972 vector code:
4973 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4974 jmp to epilogue
4976 Hence the run-time scalar cost should be incremented by 2 taken
4977 branches.
4979 TODO: The back end may reorder the BBS's differently and reverse
4980 conditions/branch directions. Change the estimates below to
4981 something more reasonable. */
4983 /* If the number of iterations is known and we do not do versioning, we can
4984 decide whether to vectorize at compile time. Hence the scalar version
4985 does not carry cost model guard costs. */
4986 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4987 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4989 /* Cost model check occurs at versioning. */
4990 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4991 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4992 else
4994 /* Cost model check occurs at prologue generation. */
4995 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4996 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4997 + vect_get_stmt_cost (cond_branch_not_taken);
4998 /* Cost model check occurs at epilogue generation. */
4999 else
5000 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5004 /* Complete the target-specific cost calculations. */
5005 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5006 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5007 suggested_unroll_factor);
5009 if (suggested_unroll_factor && *suggested_unroll_factor > 1
5010 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5011 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5012 *suggested_unroll_factor,
5013 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5015 if (dump_enabled_p ())
5016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5017 "can't unroll as unrolled vectorization factor larger"
5018 " than maximum vectorization factor: "
5019 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5020 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5021 *suggested_unroll_factor = 1;
5024 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5026 if (dump_enabled_p ())
5028 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5029 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5030 vec_inside_cost);
5031 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5032 vec_prologue_cost);
5033 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5034 vec_epilogue_cost);
5035 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5036 scalar_single_iter_cost);
5037 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5038 scalar_outside_cost);
5039 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5040 vec_outside_cost);
5041 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5042 peel_iters_prologue);
5043 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5044 peel_iters_epilogue);
5047 /* Calculate number of iterations required to make the vector version
5048 profitable, relative to the loop bodies only. The following condition
5049 must hold true:
5050 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5051 where
5052 SIC = scalar iteration cost, VIC = vector iteration cost,
5053 VOC = vector outside cost, VF = vectorization factor,
5054 NPEEL = prologue iterations + epilogue iterations,
5055 SOC = scalar outside cost for run time cost model check. */
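/* A worked example with made-up costs, purely for illustration: with
   SIC = 4, VIC = 6, VOC = 20, SOC = 6, VF = 8 and NPEEL = 3, each vector
   iteration saves SIC * VF - VIC = 26 units of scalar cost while the
   vector version pays roughly VOC - SOC - SIC * NPEEL = 2 extra units of
   one-off overhead, so only a few scalar iterations are needed before
   vectorization pays off; the code below performs this computation
   precisely, including the adjustments described in the comments.  */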
5057 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5058 - vec_inside_cost);
5059 if (saving_per_viter <= 0)
5061 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5062 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5063 "vectorization did not happen for a simd loop");
5065 if (dump_enabled_p ())
5066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5067 "cost model: the vector iteration cost = %d "
5068 "divided by the scalar iteration cost = %d "
5069 "is greater than or equal to the vectorization factor = %d"
5070 ".\n",
5071 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5072 *ret_min_profitable_niters = -1;
5073 *ret_min_profitable_estimate = -1;
5074 return;
5077 /* ??? The "if" arm is written to handle all cases; see below for what
5078 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5079 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5081 /* Rewriting the condition above in terms of the number of
5082 vector iterations (vniters) rather than the number of
5083 scalar iterations (niters) gives:
5085 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5087 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5089 For integer N, X and Y when X > 0:
5091 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5092 int outside_overhead = (vec_outside_cost
5093 - scalar_single_iter_cost * peel_iters_prologue
5094 - scalar_single_iter_cost * peel_iters_epilogue
5095 - scalar_outside_cost);
5096 /* We're only interested in cases that require at least one
5097 vector iteration. */
5098 int min_vec_niters = 1;
5099 if (outside_overhead > 0)
5100 min_vec_niters = outside_overhead / saving_per_viter + 1;
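/* For instance, with hypothetical values outside_overhead = 10 and
   saving_per_viter = 3, the smallest N with N * 3 > 10 is
   10 / 3 + 1 = 4 (3 * 4 = 12 > 10, while 3 * 3 = 9 is not).  */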
5102 if (dump_enabled_p ())
5103 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5104 min_vec_niters);
5106 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5108 /* Now that we know the minimum number of vector iterations,
5109 find the minimum niters for which the scalar cost is larger:
5111 SIC * niters > VIC * vniters + VOC - SOC
5113 We know that the minimum niters is no more than
5114 vniters * VF + NPEEL, but it might be (and often is) less
5115 than that if a partial vector iteration is cheaper than the
5116 equivalent scalar code. */
5117 int threshold = (vec_inside_cost * min_vec_niters
5118 + vec_outside_cost
5119 - scalar_outside_cost);
5120 if (threshold <= 0)
5121 min_profitable_iters = 1;
5122 else
5123 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5125 else
5126 /* Convert the number of vector iterations into a number of
5127 scalar iterations. */
5128 min_profitable_iters = (min_vec_niters * assumed_vf
5129 + peel_iters_prologue
5130 + peel_iters_epilogue);
5132 else
5134 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5135 * assumed_vf
5136 - vec_inside_cost * peel_iters_prologue
5137 - vec_inside_cost * peel_iters_epilogue);
5138 if (min_profitable_iters <= 0)
5139 min_profitable_iters = 0;
5140 else
5142 min_profitable_iters /= saving_per_viter;
5144 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5145 <= (((int) vec_inside_cost * min_profitable_iters)
5146 + (((int) vec_outside_cost - scalar_outside_cost)
5147 * assumed_vf)))
5148 min_profitable_iters++;
5152 if (dump_enabled_p ())
5153 dump_printf (MSG_NOTE,
5154 " Calculated minimum iters for profitability: %d\n",
5155 min_profitable_iters);
5157 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5158 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5159 /* We want the vectorized loop to execute at least once. */
5160 min_profitable_iters = assumed_vf + peel_iters_prologue;
5161 else if (min_profitable_iters < peel_iters_prologue)
5162 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5163 vectorized loop executes at least once. */
5164 min_profitable_iters = peel_iters_prologue;
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_NOTE, vect_location,
5168 " Runtime profitability threshold = %d\n",
5169 min_profitable_iters);
5171 *ret_min_profitable_niters = min_profitable_iters;
5173 /* Calculate number of iterations required to make the vector version
5174 profitable, relative to the loop bodies only.
5176 Non-vectorized variant is SIC * niters and it must win over vector
5177 variant on the expected loop trip count. The following condition must hold true:
5178 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
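/* Note the sign of SOC compared with the runtime condition above: there the
   scalar fallback has already paid for the cost model guard, so SOC counted
   on the scalar side, whereas the static estimate compares against a plain
   scalar loop with no guard, so SOC is counted as extra overhead of the
   vectorized version and is added to the vector side.  */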
5180 if (vec_outside_cost <= 0)
5181 min_profitable_estimate = 0;
5182 /* ??? This "else if" arm is written to handle all cases; see below for
5183 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5184 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5186 /* This is a repeat of the code above, but with + SOC rather
5187 than - SOC. */
5188 int outside_overhead = (vec_outside_cost
5189 - scalar_single_iter_cost * peel_iters_prologue
5190 - scalar_single_iter_cost * peel_iters_epilogue
5191 + scalar_outside_cost);
5192 int min_vec_niters = 1;
5193 if (outside_overhead > 0)
5194 min_vec_niters = outside_overhead / saving_per_viter + 1;
5196 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5198 int threshold = (vec_inside_cost * min_vec_niters
5199 + vec_outside_cost
5200 + scalar_outside_cost);
5201 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5203 else
5204 min_profitable_estimate = (min_vec_niters * assumed_vf
5205 + peel_iters_prologue
5206 + peel_iters_epilogue);
5208 else
5210 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5211 * assumed_vf
5212 - vec_inside_cost * peel_iters_prologue
5213 - vec_inside_cost * peel_iters_epilogue)
5214 / ((scalar_single_iter_cost * assumed_vf)
5215 - vec_inside_cost);
5217 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5218 if (dump_enabled_p ())
5219 dump_printf_loc (MSG_NOTE, vect_location,
5220 " Static estimate profitability threshold = %d\n",
5221 min_profitable_estimate);
5223 *ret_min_profitable_estimate = min_profitable_estimate;
5226 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5227 vector elements (not bits) for a vector with NELT elements. */
5228 static void
5229 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5230 vec_perm_builder *sel)
5232 /* The encoding is a single stepped pattern. Any wrap-around is handled
5233 by vec_perm_indices. */
5234 sel->new_vector (nelt, 1, 3);
5235 for (unsigned int i = 0; i < 3; i++)
5236 sel->quick_push (i + offset);
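/* For example, for OFFSET = 2 and NELT = 8 the pushed pattern {2, 3, 4}
   expands to the selector {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select
   from the second vec_perm operand, which provides the wrap-around
   behaviour of a whole-vector shift.  */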
5239 /* Checks whether the target supports whole-vector shifts for vectors of mode
5240 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5241 it supports vec_perm_const with masks for all necessary shift amounts. */
5242 static bool
5243 have_whole_vector_shift (machine_mode mode)
5245 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5246 return true;
5248 /* Variable-length vectors should be handled via the optab. */
5249 unsigned int nelt;
5250 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5251 return false;
5253 vec_perm_builder sel;
5254 vec_perm_indices indices;
5255 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5257 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5258 indices.new_vector (sel, 2, nelt);
5259 if (!can_vec_perm_const_p (mode, mode, indices, false))
5260 return false;
5262 return true;
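/* For an eight-element vector, for instance, have_whole_vector_shift checks
   the shift amounts 4, 2 and 1, which are exactly the amounts used by the
   shift-based reduction epilogue further below.  */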
5265 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5266 multiplication operands have differing signs and (b) we intend
5267 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5268 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5270 static bool
5271 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5272 stmt_vec_info stmt_info)
5274 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5275 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5276 return false;
5278 tree rhs1 = gimple_assign_rhs1 (assign);
5279 tree rhs2 = gimple_assign_rhs2 (assign);
5280 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5281 return false;
5283 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5284 gcc_assert (reduc_info->is_reduc_info);
5285 return !directly_supported_p (DOT_PROD_EXPR,
5286 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5287 optab_vector_mixed_sign);
5290 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5291 functions. Design better to avoid maintenance issues. */
5293 /* Function vect_model_reduction_cost.
5295 Models cost for a reduction operation, including the vector ops
5296 generated within the strip-mine loop in some cases, the initial
5297 definition before the loop, and the epilogue code that must be generated. */
5299 static void
5300 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5301 stmt_vec_info stmt_info, internal_fn reduc_fn,
5302 vect_reduction_type reduction_type,
5303 int ncopies, stmt_vector_for_cost *cost_vec)
5305 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5306 tree vectype;
5307 machine_mode mode;
5308 class loop *loop = NULL;
5310 if (loop_vinfo)
5311 loop = LOOP_VINFO_LOOP (loop_vinfo);
5313 /* Condition reductions generate two reductions in the loop. */
5314 if (reduction_type == COND_REDUCTION)
5315 ncopies *= 2;
5317 vectype = STMT_VINFO_VECTYPE (stmt_info);
5318 mode = TYPE_MODE (vectype);
5319 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5321 gimple_match_op op;
5322 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5323 gcc_unreachable ();
5325 bool emulated_mixed_dot_prod
5326 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5327 if (reduction_type == EXTRACT_LAST_REDUCTION)
5328 /* No extra instructions are needed in the prologue. The loop body
5329 operations are costed in vectorizable_condition. */
5330 inside_cost = 0;
5331 else if (reduction_type == FOLD_LEFT_REDUCTION)
5333 /* No extra instructions needed in the prologue. */
5334 prologue_cost = 0;
5336 if (reduc_fn != IFN_LAST)
5337 /* Count one reduction-like operation per vector. */
5338 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5339 stmt_info, 0, vect_body);
5340 else
5342 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5343 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5344 inside_cost = record_stmt_cost (cost_vec, nelements,
5345 vec_to_scalar, stmt_info, 0,
5346 vect_body);
5347 inside_cost += record_stmt_cost (cost_vec, nelements,
5348 scalar_stmt, stmt_info, 0,
5349 vect_body);
5352 else
5354 /* Add in the cost of the initial definitions. */
5355 int prologue_stmts;
5356 if (reduction_type == COND_REDUCTION)
5357 /* For cond reductions we have four vectors: initial index, step,
5358 initial result of the data reduction, initial value of the index
5359 reduction. */
5360 prologue_stmts = 4;
5361 else if (emulated_mixed_dot_prod)
5362 /* We need the initial reduction value and two invariants:
5363 one that contains the minimum signed value and one that
5364 contains half of its negative. */
5365 prologue_stmts = 3;
5366 else
5367 prologue_stmts = 1;
5368 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5369 scalar_to_vec, stmt_info, 0,
5370 vect_prologue);
5373 /* Determine cost of epilogue code.
5375 We have a reduction operator that will reduce the vector in one statement.
5376 Also requires scalar extract. */
5378 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5380 if (reduc_fn != IFN_LAST)
5382 if (reduction_type == COND_REDUCTION)
5384 /* An EQ stmt and a COND_EXPR stmt. */
5385 epilogue_cost += record_stmt_cost (cost_vec, 2,
5386 vector_stmt, stmt_info, 0,
5387 vect_epilogue);
5388 /* Reduction of the max index and a reduction of the found
5389 values. */
5390 epilogue_cost += record_stmt_cost (cost_vec, 2,
5391 vec_to_scalar, stmt_info, 0,
5392 vect_epilogue);
5393 /* A broadcast of the max value. */
5394 epilogue_cost += record_stmt_cost (cost_vec, 1,
5395 scalar_to_vec, stmt_info, 0,
5396 vect_epilogue);
5398 else
5400 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5401 stmt_info, 0, vect_epilogue);
5402 epilogue_cost += record_stmt_cost (cost_vec, 1,
5403 vec_to_scalar, stmt_info, 0,
5404 vect_epilogue);
5407 else if (reduction_type == COND_REDUCTION)
5409 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5410 /* Extraction of scalar elements. */
5411 epilogue_cost += record_stmt_cost (cost_vec,
5412 2 * estimated_nunits,
5413 vec_to_scalar, stmt_info, 0,
5414 vect_epilogue);
5415 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5416 epilogue_cost += record_stmt_cost (cost_vec,
5417 2 * estimated_nunits - 3,
5418 scalar_stmt, stmt_info, 0,
5419 vect_epilogue);
5421 else if (reduction_type == EXTRACT_LAST_REDUCTION
5422 || reduction_type == FOLD_LEFT_REDUCTION)
5423 /* No extra instructions are needed in the epilogue. */
5425 else
5427 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5428 tree bitsize = TYPE_SIZE (op.type);
5429 int element_bitsize = tree_to_uhwi (bitsize);
5430 int nelements = vec_size_in_bits / element_bitsize;
5432 if (op.code == COND_EXPR)
5433 op.code = MAX_EXPR;
5435 /* We have a whole vector shift available. */
5436 if (VECTOR_MODE_P (mode)
5437 && directly_supported_p (op.code, vectype)
5438 && have_whole_vector_shift (mode))
5440 /* Final reduction via vector shifts and the reduction operator.
5441 Also requires scalar extract. */
5442 epilogue_cost += record_stmt_cost (cost_vec,
5443 exact_log2 (nelements) * 2,
5444 vector_stmt, stmt_info, 0,
5445 vect_epilogue);
5446 epilogue_cost += record_stmt_cost (cost_vec, 1,
5447 vec_to_scalar, stmt_info, 0,
5448 vect_epilogue);
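/* With a hypothetical NELEMENTS = 8, for instance, this records
   exact_log2 (8) * 2 = 6 vector statements (three shifts and three
   applications of the reduction operator) plus one extract.  */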
5450 else
5451 /* Use extracts and reduction op for final reduction. For N
5452 elements, we have N extracts and N-1 reduction ops. */
5453 epilogue_cost += record_stmt_cost (cost_vec,
5454 nelements + nelements - 1,
5455 vector_stmt, stmt_info, 0,
5456 vect_epilogue);
5460 if (dump_enabled_p ())
5461 dump_printf (MSG_NOTE,
5462 "vect_model_reduction_cost: inside_cost = %d, "
5463 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5464 prologue_cost, epilogue_cost);
5467 /* SEQ is a sequence of instructions that initialize the reduction
5468 described by REDUC_INFO. Emit them in the appropriate place. */
5470 static void
5471 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5472 stmt_vec_info reduc_info, gimple *seq)
5474 if (reduc_info->reused_accumulator)
5476 /* When reusing an accumulator from the main loop, we only need
5477 initialization instructions if the main loop can be skipped.
5478 In that case, emit the initialization instructions at the end
5479 of the guard block that does the skip. */
5480 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5481 gcc_assert (skip_edge);
5482 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5483 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5485 else
5487 /* The normal case: emit the initialization instructions on the
5488 preheader edge. */
5489 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5490 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5494 /* Function get_initial_def_for_reduction
5496 Input:
5497 REDUC_INFO - the info_for_reduction
5498 INIT_VAL - the initial value of the reduction variable
5499 NEUTRAL_OP - a value that has no effect on the reduction, as per
5500 neutral_op_for_reduction
5502 Output:
5503 Return a vector variable, initialized according to the reduction that
5504 REDUC_INFO describes. This vector will be used as the initial value
5505 of the vector of partial results.
5507 The value we need is a vector in which element 0 has value INIT_VAL
5508 and every other element has value NEUTRAL_OP. */
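/* For example, for a V4SI sum reduction with INIT_VAL 5 and NEUTRAL_OP 0
   (illustrative values), the result is the vector { 5, 0, 0, 0 }.  */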
5510 static tree
5511 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5512 stmt_vec_info reduc_info,
5513 tree init_val, tree neutral_op)
5515 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5516 tree scalar_type = TREE_TYPE (init_val);
5517 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5518 tree init_def;
5519 gimple_seq stmts = NULL;
5521 gcc_assert (vectype);
5523 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5524 || SCALAR_FLOAT_TYPE_P (scalar_type));
5526 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5527 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5529 if (operand_equal_p (init_val, neutral_op))
5531 /* If both elements are equal then the vector described above is
5532 just a splat. */
5533 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5534 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5536 else
5538 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5539 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5540 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5542 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5543 element 0. */
5544 init_def = gimple_build_vector_from_val (&stmts, vectype,
5545 neutral_op);
5546 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5547 vectype, init_def, init_val);
5549 else
5551 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5552 tree_vector_builder elts (vectype, 1, 2);
5553 elts.quick_push (init_val);
5554 elts.quick_push (neutral_op);
5555 init_def = gimple_build_vector (&stmts, &elts);
5559 if (stmts)
5560 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5561 return init_def;
5564 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5565 which performs a reduction involving GROUP_SIZE scalar statements.
5566 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5567 is nonnull, introducing extra elements of that value will not change the
5568 result. */
5570 static void
5571 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5572 stmt_vec_info reduc_info,
5573 vec<tree> *vec_oprnds,
5574 unsigned int number_of_vectors,
5575 unsigned int group_size, tree neutral_op)
5577 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5578 unsigned HOST_WIDE_INT nunits;
5579 unsigned j, number_of_places_left_in_vector;
5580 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5581 unsigned int i;
5583 gcc_assert (group_size == initial_values.length () || neutral_op);
5585 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5586 created vectors. It is greater than 1 if unrolling is performed.
5588 For example, we have two scalar operands, s1 and s2 (e.g., group of
5589 strided accesses of size two), while NUNITS is four (i.e., four scalars
5590 of this type can be packed in a vector). The output vector will contain
5591 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5592 will be 2).
5594 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5595 vectors containing the operands.
5597 For example, NUNITS is four as before, and the group size is 8
5598 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5599 {s5, s6, s7, s8}. */
5601 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5602 nunits = group_size;
5604 number_of_places_left_in_vector = nunits;
5605 bool constant_p = true;
5606 tree_vector_builder elts (vector_type, nunits, 1);
5607 elts.quick_grow (nunits);
5608 gimple_seq ctor_seq = NULL;
5609 for (j = 0; j < nunits * number_of_vectors; ++j)
5611 tree op;
5612 i = j % group_size;
5614 /* Get the def before the loop. In reduction chain we have only
5615 one initial value. Else we have as many as PHIs in the group. */
5616 if (i >= initial_values.length () || (j > i && neutral_op))
5617 op = neutral_op;
5618 else
5619 op = initial_values[i];
5621 /* Create 'vect_ = {op0,op1,...,opn}'. */
5622 number_of_places_left_in_vector--;
5623 elts[nunits - number_of_places_left_in_vector - 1] = op;
5624 if (!CONSTANT_CLASS_P (op))
5625 constant_p = false;
5627 if (number_of_places_left_in_vector == 0)
5629 tree init;
5630 if (constant_p && !neutral_op
5631 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5632 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5633 /* Build the vector directly from ELTS. */
5634 init = gimple_build_vector (&ctor_seq, &elts);
5635 else if (neutral_op)
5637 /* Build a vector of the neutral value and shift the
5638 other elements into place. */
5639 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5640 neutral_op);
5641 int k = nunits;
5642 while (k > 0 && elts[k - 1] == neutral_op)
5643 k -= 1;
5644 while (k > 0)
5646 k -= 1;
5647 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5648 vector_type, init, elts[k]);
5651 else
5653 /* First time round, duplicate ELTS to fill the
5654 required number of vectors. */
5655 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5656 elts, number_of_vectors, *vec_oprnds);
5657 break;
5659 vec_oprnds->quick_push (init);
5661 number_of_places_left_in_vector = nunits;
5662 elts.new_vector (vector_type, nunits, 1);
5663 elts.quick_grow (nunits);
5664 constant_p = true;
5667 if (ctor_seq != NULL)
5668 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5671 /* For a statement STMT_INFO taking part in a reduction operation return
5672 the stmt_vec_info the meta information is stored on. */
5674 stmt_vec_info
5675 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5677 stmt_info = vect_orig_stmt (stmt_info);
5678 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5679 if (!is_a <gphi *> (stmt_info->stmt)
5680 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5681 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5682 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5683 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5685 if (gimple_phi_num_args (phi) == 1)
5686 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5688 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5690 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5691 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5692 stmt_info = info;
5694 return stmt_info;
5697 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5698 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5699 return false. */
5701 static bool
5702 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5703 stmt_vec_info reduc_info)
5705 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5706 if (!main_loop_vinfo)
5707 return false;
5709 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5710 return false;
5712 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5713 auto_vec<tree, 16> main_loop_results (num_phis);
5714 auto_vec<tree, 16> initial_values (num_phis);
5715 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5717 /* The epilogue loop can be entered either from the main loop or
5718 from an earlier guard block. */
5719 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5720 for (tree incoming_value : reduc_info->reduc_initial_values)
5722 /* Look for:
5724 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5725 INITIAL_VALUE(guard block)>. */
5726 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5728 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5729 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5731 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5732 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5734 main_loop_results.quick_push (from_main_loop);
5735 initial_values.quick_push (from_skip);
5738 else
5739 /* The main loop dominates the epilogue loop. */
5740 main_loop_results.splice (reduc_info->reduc_initial_values);
5742 /* See if the main loop has the kind of accumulator we need. */
5743 vect_reusable_accumulator *accumulator
5744 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5745 if (!accumulator
5746 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5747 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5748 accumulator->reduc_info->reduc_scalar_results.begin ()))
5749 return false;
5751 /* Handle the case where we can reduce wider vectors to narrower ones. */
5752 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5753 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5754 unsigned HOST_WIDE_INT m;
5755 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5756 TYPE_VECTOR_SUBPARTS (vectype), &m))
5757 return false;
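/* For instance, if the main loop accumulated in V8SI and this epilogue loop
   uses V4SI (hypothetical modes), M is 2 and a single halving step, i.e.
   extracting the two V4SI halves and combining them with the reduction
   operation, brings the reused accumulator down to the epilogue's vector
   type; the checks below verify that this extract and operation exist.  */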
5758 /* Check the intermediate vector types and operations are available. */
5759 tree prev_vectype = old_vectype;
5760 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5761 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5763 intermediate_nunits = exact_div (intermediate_nunits, 2);
5764 tree intermediate_vectype = get_related_vectype_for_scalar_type
5765 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5766 if (!intermediate_vectype
5767 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5768 intermediate_vectype)
5769 || !can_vec_extract (TYPE_MODE (prev_vectype),
5770 TYPE_MODE (intermediate_vectype)))
5771 return false;
5772 prev_vectype = intermediate_vectype;
5775 /* Non-SLP reductions might apply an adjustment after the reduction
5776 operation, in order to simplify the initialization of the accumulator.
5777 If the epilogue loop carries on from where the main loop left off,
5778 it should apply the same adjustment to the final reduction result.
5780 If the epilogue loop can also be entered directly (rather than via
5781 the main loop), we need to be able to handle that case in the same way,
5782 with the same adjustment. (In principle we could add a PHI node
5783 to select the correct adjustment, but in practice that shouldn't be
5784 necessary.) */
5785 tree main_adjustment
5786 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5787 if (loop_vinfo->main_loop_edge && main_adjustment)
5789 gcc_assert (num_phis == 1);
5790 tree initial_value = initial_values[0];
5791 /* Check that we can use INITIAL_VALUE as the adjustment and
5792 initialize the accumulator with a neutral value instead. */
5793 if (!operand_equal_p (initial_value, main_adjustment))
5794 return false;
5795 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5796 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5797 code, initial_value);
5799 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5800 reduc_info->reduc_initial_values.truncate (0);
5801 reduc_info->reduc_initial_values.splice (initial_values);
5802 reduc_info->reused_accumulator = accumulator;
5803 return true;
5806 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5807 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5809 static tree
5810 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5811 gimple_seq *seq)
5813 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5814 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5815 tree stype = TREE_TYPE (vectype);
5816 tree new_temp = vec_def;
5817 while (nunits > nunits1)
5819 nunits /= 2;
5820 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5821 stype, nunits);
5822 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5824 /* The target has to make sure we support lowpart/highpart
5825 extraction, either via direct vector extract or through
5826 an integer mode punning. */
5827 tree dst1, dst2;
5828 gimple *epilog_stmt;
5829 if (convert_optab_handler (vec_extract_optab,
5830 TYPE_MODE (TREE_TYPE (new_temp)),
5831 TYPE_MODE (vectype1))
5832 != CODE_FOR_nothing)
5834 /* Extract sub-vectors directly once vec_extract becomes
5835 a conversion optab. */
5836 dst1 = make_ssa_name (vectype1);
5837 epilog_stmt
5838 = gimple_build_assign (dst1, BIT_FIELD_REF,
5839 build3 (BIT_FIELD_REF, vectype1,
5840 new_temp, TYPE_SIZE (vectype1),
5841 bitsize_int (0)));
5842 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5843 dst2 = make_ssa_name (vectype1);
5844 epilog_stmt
5845 = gimple_build_assign (dst2, BIT_FIELD_REF,
5846 build3 (BIT_FIELD_REF, vectype1,
5847 new_temp, TYPE_SIZE (vectype1),
5848 bitsize_int (bitsize)));
5849 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5851 else
5853 /* Extract via punning to appropriately sized integer mode
5854 vector. */
5855 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5856 tree etype = build_vector_type (eltype, 2);
5857 gcc_assert (convert_optab_handler (vec_extract_optab,
5858 TYPE_MODE (etype),
5859 TYPE_MODE (eltype))
5860 != CODE_FOR_nothing);
5861 tree tem = make_ssa_name (etype);
5862 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5863 build1 (VIEW_CONVERT_EXPR,
5864 etype, new_temp));
5865 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5866 new_temp = tem;
5867 tem = make_ssa_name (eltype);
5868 epilog_stmt
5869 = gimple_build_assign (tem, BIT_FIELD_REF,
5870 build3 (BIT_FIELD_REF, eltype,
5871 new_temp, TYPE_SIZE (eltype),
5872 bitsize_int (0)));
5873 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5874 dst1 = make_ssa_name (vectype1);
5875 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5876 build1 (VIEW_CONVERT_EXPR,
5877 vectype1, tem));
5878 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5879 tem = make_ssa_name (eltype);
5880 epilog_stmt
5881 = gimple_build_assign (tem, BIT_FIELD_REF,
5882 build3 (BIT_FIELD_REF, eltype,
5883 new_temp, TYPE_SIZE (eltype),
5884 bitsize_int (bitsize)));
5885 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5886 dst2 = make_ssa_name (vectype1);
5887 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5888 build1 (VIEW_CONVERT_EXPR,
5889 vectype1, tem));
5890 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5893 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5896 return new_temp;
5899 /* Retrieves the defining statement to be used for a reduction.
5900 For LAST_VAL_REDUC_P we use the current VEC_STMTs which correspond to the
5901 final value after vectorization and otherwise we look at the reduction
5902 definitions to get the first. */
5904 tree
5905 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5906 slp_instance slp_node_instance, bool last_val_reduc_p,
5907 unsigned i, vec <gimple *> &vec_stmts)
5909 tree def;
5911 if (slp_node)
5913 if (!last_val_reduc_p)
5914 slp_node = slp_node_instance->reduc_phis;
5915 def = vect_get_slp_vect_def (slp_node, i);
5917 else
5919 if (!last_val_reduc_p)
5920 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5921 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5922 def = gimple_get_lhs (vec_stmts[0]);
5925 return def;
5928 /* Function vect_create_epilog_for_reduction
5930 Create code at the loop-epilog to finalize the result of a reduction
5931 computation.
5933 STMT_INFO is the scalar reduction stmt that is being vectorized.
5934 SLP_NODE is an SLP node containing a group of reduction statements. The
5935 first one in this group is STMT_INFO.
5936 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5937 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5938 (counting from 0)
5939 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5940 exit this edge is always the main loop exit.
5942 This function:
5943 1. Completes the reduction def-use cycles.
5944 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5945 by calling the function specified by REDUC_FN if available, or by
5946 other means (whole-vector shifts or a scalar loop).
5947 The function also creates a new phi node at the loop exit to preserve
5948 loop-closed form, as illustrated below.
5950 The flow at the entry to this function:
5952 loop:
5953 vec_def = phi <vec_init, null> # REDUCTION_PHI
5954 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5955 s_loop = scalar_stmt # (scalar) STMT_INFO
5956 loop_exit:
5957 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5958 use <s_out0>
5959 use <s_out0>
5961 The above is transformed by this function into:
5963 loop:
5964 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5965 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5966 s_loop = scalar_stmt # (scalar) STMT_INFO
5967 loop_exit:
5968 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5969 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5970 v_out2 = reduce <v_out1>
5971 s_out3 = extract_field <v_out2, 0>
5972 s_out4 = adjust_result <s_out3>
5973 use <s_out4>
5974 use <s_out4>
5977 static void
5978 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5979 stmt_vec_info stmt_info,
5980 slp_tree slp_node,
5981 slp_instance slp_node_instance,
5982 edge loop_exit)
5984 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5985 gcc_assert (reduc_info->is_reduc_info);
5986 /* For double reductions we need to get at the inner loop reduction
5987 stmt which has the meta info attached. Our stmt_info is that of the
5988 loop-closed PHI of the inner loop which we remember as
5989 def for the reduction PHI generation. */
5990 bool double_reduc = false;
5991 bool last_val_reduc_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit
5992 && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
5993 stmt_vec_info rdef_info = stmt_info;
5994 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5996 gcc_assert (!slp_node);
5997 double_reduc = true;
5998 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5999 (stmt_info->stmt, 0));
6000 stmt_info = vect_stmt_to_vectorize (stmt_info);
6002 gphi *reduc_def_stmt
6003 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
6004 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
6005 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6006 tree vectype;
6007 machine_mode mode;
6008 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6009 basic_block exit_bb;
6010 tree scalar_dest;
6011 tree scalar_type;
6012 gimple *new_phi = NULL, *phi = NULL;
6013 gimple_stmt_iterator exit_gsi;
6014 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6015 gimple *epilog_stmt = NULL;
6016 gimple *exit_phi;
6017 tree bitsize;
6018 tree def;
6019 tree orig_name, scalar_result;
6020 imm_use_iterator imm_iter, phi_imm_iter;
6021 use_operand_p use_p, phi_use_p;
6022 gimple *use_stmt;
6023 auto_vec<tree> reduc_inputs;
6024 int j, i;
6025 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6026 unsigned int group_size = 1, k;
6027 /* SLP reduction without reduction chain, e.g.,
6028 # a1 = phi <a2, a0>
6029 # b1 = phi <b2, b0>
6030 a2 = operation (a1)
6031 b2 = operation (b1) */
6032 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6033 bool direct_slp_reduc;
6034 tree induction_index = NULL_TREE;
6036 if (slp_node)
6037 group_size = SLP_TREE_LANES (slp_node);
6039 if (nested_in_vect_loop_p (loop, stmt_info))
6041 outer_loop = loop;
6042 loop = loop->inner;
6043 gcc_assert (!slp_node && double_reduc);
6046 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6047 gcc_assert (vectype);
6048 mode = TYPE_MODE (vectype);
6050 tree induc_val = NULL_TREE;
6051 tree adjustment_def = NULL;
6052 if (slp_node)
6054 else
6056 /* Optimize: for induction condition reduction, if we can't use zero
6057 for induc_val, use initial_def. */
6058 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6059 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6060 else if (double_reduc)
6062 else
6063 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6066 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6067 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6068 if (slp_reduc)
6069 /* All statements produce live-out values. */
6070 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6071 else if (slp_node)
6073 /* The last statement in the reduction chain produces the live-out
6074 value. Note SLP optimization can shuffle scalar stmts to
6075 optimize permutations so we have to search for the last stmt. */
6076 for (k = 0; k < group_size; ++k)
6077 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6079 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6080 break;
6084 unsigned vec_num;
6085 int ncopies;
6086 if (slp_node)
6088 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6089 ncopies = 1;
6091 else
6093 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6094 vec_num = 1;
6095 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6098 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6099 which is updated with the current index of the loop for every match of
6100 the original loop's cond_expr (VEC_STMT). This results in a vector
6101 containing the last time the condition passed for that vector lane.
6102 The first match will be a 1 to allow 0 to be used for non-matching
6103 indexes. If there are no matches at all then the vector will be all
6104 zeroes.
6106 PR92772: This algorithm is broken for architectures that support
6107 masked vectors, but do not provide fold_extract_last. */
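/* As an illustration, with VF = 4, if the condition matches in lane 0 of
   the first vector iteration and in lane 2 of the second, the index vector
   evolves as {0,0,0,0} -> {1,0,0,0} -> {1,0,7,0}: lane 2 records the value
   7 taken from the second iteration's IV value {5,6,7,8}.  */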
6108 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6110 auto_vec<std::pair<tree, bool>, 2> ccompares;
6111 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6112 cond_info = vect_stmt_to_vectorize (cond_info);
6113 while (cond_info != reduc_info)
6115 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6117 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6118 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6119 ccompares.safe_push
6120 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6121 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6123 cond_info
6124 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6125 1 + STMT_VINFO_REDUC_IDX
6126 (cond_info)));
6127 cond_info = vect_stmt_to_vectorize (cond_info);
6129 gcc_assert (ccompares.length () != 0);
6131 tree indx_before_incr, indx_after_incr;
6132 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6133 int scalar_precision
6134 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6135 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6136 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6137 (TYPE_MODE (vectype), cr_index_scalar_type,
6138 TYPE_VECTOR_SUBPARTS (vectype));
6140 /* First we create a simple vector induction variable which starts
6141 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6142 vector size (STEP). */
6144 /* Create a {1,2,3,...} vector. */
6145 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6147 /* Create a vector of the step value. */
6148 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6149 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6151 /* Create an induction variable. */
6152 gimple_stmt_iterator incr_gsi;
6153 bool insert_after;
6154 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6155 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6156 insert_after, &indx_before_incr, &indx_after_incr);
6158 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6159 filled with zeros (VEC_ZERO). */
6161 /* Create a vector of 0s. */
6162 tree zero = build_zero_cst (cr_index_scalar_type);
6163 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6165 /* Create a vector phi node. */
6166 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6167 new_phi = create_phi_node (new_phi_tree, loop->header);
6168 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6169 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6171 /* Now take the condition from the loops original cond_exprs
6172 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6173 every match uses values from the induction variable
6174 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6175 (NEW_PHI_TREE).
6176 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6177 the new cond_expr (INDEX_COND_EXPR). */
6178 gimple_seq stmts = NULL;
6179 for (int i = ccompares.length () - 1; i != -1; --i)
6181 tree ccompare = ccompares[i].first;
6182 if (ccompares[i].second)
6183 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6184 cr_index_vector_type,
6185 ccompare,
6186 indx_before_incr, new_phi_tree);
6187 else
6188 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6189 cr_index_vector_type,
6190 ccompare,
6191 new_phi_tree, indx_before_incr);
6193 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6195 /* Update the phi with the vec cond. */
6196 induction_index = new_phi_tree;
6197 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6198 loop_latch_edge (loop), UNKNOWN_LOCATION);
6201 /* 2. Create epilog code.
6202 The reduction epilog code operates across the elements of the vector
6203 of partial results computed by the vectorized loop.
6204 The reduction epilog code consists of:
6206 step 1: compute the scalar result in a vector (v_out2)
6207 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6208 step 3: adjust the scalar result (s_out3) if needed.
6210 Step 1 can be accomplished using one of the following three schemes:
6211 (scheme 1) using reduc_fn, if available.
6212 (scheme 2) using whole-vector shifts, if available.
6213 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6214 combined.
6216 The overall epilog code looks like this:
6218 s_out0 = phi <s_loop> # original EXIT_PHI
6219 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6220 v_out2 = reduce <v_out1> # step 1
6221 s_out3 = extract_field <v_out2, 0> # step 2
6222 s_out4 = adjust_result <s_out3> # step 3
6224 (step 3 is optional, and steps 1 and 2 may be combined).
6225 Lastly, the uses of s_out0 are replaced by s_out4. */
6228 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6229 v_out1 = phi <VECT_DEF>
6230 Store them in NEW_PHIS. */
6231 if (double_reduc)
6232 loop = outer_loop;
6233 /* We need to reduce values in all exits. */
6234 exit_bb = loop_exit->dest;
6235 exit_gsi = gsi_after_labels (exit_bb);
6236 reduc_inputs.create (slp_node ? vec_num : ncopies);
6237 vec <gimple *> vec_stmts = vNULL;
6238 for (unsigned i = 0; i < vec_num; i++)
6240 gimple_seq stmts = NULL;
6241 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6242 last_val_reduc_p, i, vec_stmts);
6243 for (j = 0; j < ncopies; j++)
6245 tree new_def = copy_ssa_name (def);
6246 phi = create_phi_node (new_def, exit_bb);
6247 if (j)
6248 def = gimple_get_lhs (vec_stmts[j]);
6249 if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6250 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6251 else
6253 for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6254 SET_PHI_ARG_DEF (phi, k, def);
6256 new_def = gimple_convert (&stmts, vectype, new_def);
6257 reduc_inputs.quick_push (new_def);
6259 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6262 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6263 (i.e. when reduc_fn is not available) and in the final adjustment
6264 code (if needed). Also get the original scalar reduction variable as
6265 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6266 represents a reduction pattern), the tree-code and scalar-def are
6267 taken from the original stmt that the pattern-stmt (STMT) replaces.
6268 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6269 are taken from STMT. */
6271 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6272 if (orig_stmt_info != stmt_info)
6274 /* Reduction pattern */
6275 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6276 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6279 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6280 scalar_type = TREE_TYPE (scalar_dest);
6281 scalar_results.truncate (0);
6282 scalar_results.reserve_exact (group_size);
6283 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6284 bitsize = TYPE_SIZE (scalar_type);
6286 /* True if we should implement SLP_REDUC using native reduction operations
6287 instead of scalar operations. */
6288 direct_slp_reduc = (reduc_fn != IFN_LAST
6289 && slp_reduc
6290 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6292 /* In case of reduction chain, e.g.,
6293 # a1 = phi <a3, a0>
6294 a2 = operation (a1)
6295 a3 = operation (a2),
6297 we may end up with more than one vector result. Here we reduce them
6298 to one vector.
6300 The same is true for a SLP reduction, e.g.,
6301 # a1 = phi <a2, a0>
6302 # b1 = phi <b2, b0>
6303 a2 = operation (a1)
6304 b2 = operation (b1),
6306 where we can end up with more than one vector as well. We can
6307 easily accumulate vectors when the number of vector elements is
6308 a multiple of the SLP group size.
6310 The same is true if we couldn't use a single def-use cycle. */
6311 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6312 || direct_slp_reduc
6313 || (slp_reduc
6314 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6315 || ncopies > 1)
6317 gimple_seq stmts = NULL;
6318 tree single_input = reduc_inputs[0];
6319 for (k = 1; k < reduc_inputs.length (); k++)
6320 single_input = gimple_build (&stmts, code, vectype,
6321 single_input, reduc_inputs[k]);
6322 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6324 reduc_inputs.truncate (0);
6325 reduc_inputs.safe_push (single_input);
6328 tree orig_reduc_input = reduc_inputs[0];
6330 /* If this loop is an epilogue loop that can be skipped after the
6331 main loop, we can only share a reduction operation between the
6332 main loop and the epilogue if we put it at the target of the
6333 skip edge.
6335 We can still reuse accumulators if this check fails. Doing so has
6336 the minor(?) benefit of making the epilogue loop's scalar result
6337 independent of the main loop's scalar result. */
6338 bool unify_with_main_loop_p = false;
6339 if (reduc_info->reused_accumulator
6340 && loop_vinfo->skip_this_loop_edge
6341 && single_succ_p (exit_bb)
6342 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6344 unify_with_main_loop_p = true;
6346 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6347 reduc_inputs[0] = make_ssa_name (vectype);
6348 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6349 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6350 UNKNOWN_LOCATION);
6351 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6352 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6353 exit_gsi = gsi_after_labels (reduc_block);
6356 /* Shouldn't be used beyond this point. */
6357 exit_bb = nullptr;
6359 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6360 && reduc_fn != IFN_LAST)
6362 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6363 various data values where the condition matched and another vector
6364 (INDUCTION_INDEX) containing all the indexes of those matches. We
6365 need to extract the last matching index (which will be the index with
6366 highest value) and use this to index into the data vector.
6367 For the case where there were no matches, the data vector will contain
6368 all default values and the index vector will be all zeros. */
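/* Continuing the illustration above: with data {d0,d1,d2,d3} and index
   vector {1,0,7,0}, the maximum index is 7, the compare selects only
   lane 2, and the final reduction therefore yields d2.  */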
6370 /* Get various versions of the type of the vector of indexes. */
6371 tree index_vec_type = TREE_TYPE (induction_index);
6372 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6373 tree index_scalar_type = TREE_TYPE (index_vec_type);
6374 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6376 /* Get an unsigned integer version of the type of the data vector. */
6377 int scalar_precision
6378 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6379 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6380 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6381 vectype);
6383 /* First we need to create a vector (ZERO_VEC) of zeros and another
6384 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6385 can create using a MAX reduction and then expanding.
6386 In the case where the loop never made any matches, the max index will
6387 be zero. */
6389 /* Vector of {0, 0, 0,...}. */
6390 tree zero_vec = build_zero_cst (vectype);
6392 /* Find maximum value from the vector of found indexes. */
6393 tree max_index = make_ssa_name (index_scalar_type);
6394 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6395 1, induction_index);
6396 gimple_call_set_lhs (max_index_stmt, max_index);
6397 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6399 /* Vector of {max_index, max_index, max_index,...}. */
6400 tree max_index_vec = make_ssa_name (index_vec_type);
6401 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6402 max_index);
6403 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6404 max_index_vec_rhs);
6405 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6407 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6408 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6409 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6410 otherwise. Only one value should match, resulting in a vector
6411 (VEC_COND) with one data value and the rest zeros.
6412 In the case where the loop never made any matches, every index will
6413 match, resulting in a vector with all data values (which will all be
6414 the default value). */
6416 /* Compare the max index vector to the vector of found indexes to find
6417 the position of the max value. */
6418 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6419 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6420 induction_index,
6421 max_index_vec);
6422 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6424 /* Use the compare to choose either values from the data vector or
6425 zero. */
6426 tree vec_cond = make_ssa_name (vectype);
6427 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6428 vec_compare,
6429 reduc_inputs[0],
6430 zero_vec);
6431 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6433 /* Finally we need to extract the data value from the vector (VEC_COND)
6434 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6435 reduction, but because this doesn't exist, we can use a MAX reduction
6436 instead. The data value might be signed or a float so we need to cast
6437 it first.
6438 In the case where the loop never made any matches, the data values are
6439 all identical, and so will reduce down correctly. */
6441 /* Make the matched data values unsigned. */
6442 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6443 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6444 vec_cond);
6445 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6446 VIEW_CONVERT_EXPR,
6447 vec_cond_cast_rhs);
6448 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6450 /* Reduce down to a scalar value. */
6451 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6452 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6453 1, vec_cond_cast);
6454 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6455 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6457 /* Convert the reduced value back to the result type and set as the
6458 result. */
6459 gimple_seq stmts = NULL;
6460 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6461 data_reduc);
6462 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6463 scalar_results.safe_push (new_temp);
6465 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6466 && reduc_fn == IFN_LAST)
6468 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6469 idx = 0;
6470 idx_val = induction_index[0];
6471 val = data_reduc[0];
6472 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6473 if (induction_index[i] > idx_val)
6474 val = data_reduc[i], idx_val = induction_index[i];
6475 return val; */
6477 tree data_eltype = TREE_TYPE (vectype);
6478 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6479 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6480 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6481 /* Enforced by vectorizable_reduction, which ensures we have target
6482 support before allowing a conditional reduction on variable-length
6483 vectors. */
6484 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6485 tree idx_val = NULL_TREE, val = NULL_TREE;
6486 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6488 tree old_idx_val = idx_val;
6489 tree old_val = val;
6490 idx_val = make_ssa_name (idx_eltype);
6491 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6492 build3 (BIT_FIELD_REF, idx_eltype,
6493 induction_index,
6494 bitsize_int (el_size),
6495 bitsize_int (off)));
6496 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6497 val = make_ssa_name (data_eltype);
6498 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6499 build3 (BIT_FIELD_REF,
6500 data_eltype,
6501 reduc_inputs[0],
6502 bitsize_int (el_size),
6503 bitsize_int (off)));
6504 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6505 if (off != 0)
6507 tree new_idx_val = idx_val;
6508 if (off != v_size - el_size)
6510 new_idx_val = make_ssa_name (idx_eltype);
6511 epilog_stmt = gimple_build_assign (new_idx_val,
6512 MAX_EXPR, idx_val,
6513 old_idx_val);
6514 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6516 tree cond = make_ssa_name (boolean_type_node);
6517 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6518 idx_val, old_idx_val);
6519 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6520 tree new_val = make_ssa_name (data_eltype);
6521 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6522 cond, val, old_val);
6523 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6524 idx_val = new_idx_val;
6525 val = new_val;
6528 /* Convert the reduced value back to the result type and set as the
6529 result. */
6530 gimple_seq stmts = NULL;
6531 val = gimple_convert (&stmts, scalar_type, val);
6532 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6533 scalar_results.safe_push (val);
6536 /* 2.3 Create the reduction code, using one of the three schemes described
6537 above. In SLP we simply need to extract all the elements from the
6538 vector (without reducing them), so we use scalar shifts. */
6539 else if (reduc_fn != IFN_LAST && !slp_reduc)
6541 tree tmp;
6542 tree vec_elem_type;
6544 /* Case 1: Create:
6545 v_out2 = reduc_expr <v_out1> */
6547 if (dump_enabled_p ())
6548 dump_printf_loc (MSG_NOTE, vect_location,
6549 "Reduce using direct vector reduction.\n");
6551 gimple_seq stmts = NULL;
6552 vec_elem_type = TREE_TYPE (vectype);
6553 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6554 vec_elem_type, reduc_inputs[0]);
6555 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6556 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6558 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6559 && induc_val)
6561 /* Earlier we set the initial value to be a vector of induc_val
6562 values. Check the result and if it is induc_val then replace
6563 with the original initial value, unless induc_val is
6564 the same as initial_def already. */
6565 tree zcompare = make_ssa_name (boolean_type_node);
6566 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6567 new_temp, induc_val);
6568 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6569 tree initial_def = reduc_info->reduc_initial_values[0];
6570 tmp = make_ssa_name (new_scalar_dest);
6571 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6572 initial_def, new_temp);
6573 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6574 new_temp = tmp;
6577 scalar_results.safe_push (new_temp);
6579 else if (direct_slp_reduc)
6581 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6582 with the elements for other SLP statements replaced with the
6583 neutral value. We can then do a normal reduction on each vector. */
6585 /* Enforced by vectorizable_reduction. */
6586 gcc_assert (reduc_inputs.length () == 1);
6587 gcc_assert (pow2p_hwi (group_size));
6589 gimple_seq seq = NULL;
6591 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6592 and the same element size as VECTYPE. */
6593 tree index = build_index_vector (vectype, 0, 1);
6594 tree index_type = TREE_TYPE (index);
6595 tree index_elt_type = TREE_TYPE (index_type);
6596 tree mask_type = truth_type_for (index_type);
6598 /* Create a vector that, for each element, identifies which of
6599 the REDUC_GROUP_SIZE results should use it. */
6600 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6601 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6602 build_vector_from_val (index_type, index_mask));
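/* For example, with a hypothetical group size of 2 and eight lanes, INDEX
   becomes {0,1,2,...,7} & {1,1,...,1} = {0,1,0,1,0,1,0,1}, so even lanes
   feed the first SLP result and odd lanes the second.  */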
6604 /* Get a neutral vector value. This is simply a splat of the neutral
6605 scalar value if we have one, otherwise the initial scalar value
6606 is itself a neutral value. */
6607 tree vector_identity = NULL_TREE;
6608 tree neutral_op = NULL_TREE;
6609 if (slp_node)
6611 tree initial_value = NULL_TREE;
6612 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6613 initial_value = reduc_info->reduc_initial_values[0];
6614 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6615 initial_value, false);
6617 if (neutral_op)
6618 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6619 neutral_op);
6620 for (unsigned int i = 0; i < group_size; ++i)
6622 /* If there's no universal neutral value, we can use the
6623 initial scalar value from the original PHI. This is used
6624 for MIN and MAX reduction, for example. */
6625 if (!neutral_op)
6627 tree scalar_value = reduc_info->reduc_initial_values[i];
6628 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6629 scalar_value);
6630 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6631 scalar_value);
6634 /* Calculate the equivalent of:
6636 sel[j] = (index[j] == i);
6638 which selects the elements of REDUC_INPUTS[0] that should
6639 be included in the result. */
6640 tree compare_val = build_int_cst (index_elt_type, i);
6641 compare_val = build_vector_from_val (index_type, compare_val);
6642 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6643 index, compare_val);
6645 /* Calculate the equivalent of:
6647 vec = sel ? reduc_inputs[0] : vector_identity;
6649 VEC is now suitable for a full vector reduction. */
6650 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6651 sel, reduc_inputs[0], vector_identity);
6653 /* Do the reduction and convert it to the appropriate type. */
6654 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6655 TREE_TYPE (vectype), vec);
6656 scalar = gimple_convert (&seq, scalar_type, scalar);
6657 scalar_results.safe_push (scalar);
6659 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6661 else
6663 bool reduce_with_shift;
6664 tree vec_temp;
6666 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6668 /* See if the target wants to do the final (shift) reduction
6669 in a vector mode of smaller size and first reduce upper/lower
6670 halves against each other. */
6671 enum machine_mode mode1 = mode;
6672 tree stype = TREE_TYPE (vectype);
6673 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6674 unsigned nunits1 = nunits;
6675 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6676 && reduc_inputs.length () == 1)
6678 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6679 /* For SLP reductions we have to make sure lanes match up, but
6680 since we're doing individual element final reduction, reducing
6681 vector width here is even more important.
6682 ??? We can also separate lanes with permutes, for the common
6683 case of power-of-two group-size odd/even extracts would work. */
6684 if (slp_reduc && nunits != nunits1)
6686 nunits1 = least_common_multiple (nunits1, group_size);
6687 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6690 if (!slp_reduc
6691 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6692 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6694 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6695 stype, nunits1);
6696 reduce_with_shift = have_whole_vector_shift (mode1);
6697 if (!VECTOR_MODE_P (mode1)
6698 || !directly_supported_p (code, vectype1))
6699 reduce_with_shift = false;
6701 /* First reduce the vector to the vector size we should do the shift
6702 reduction on, by combining upper and lower halves. */
6703 gimple_seq stmts = NULL;
6704 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6705 code, &stmts);
6706 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6707 reduc_inputs[0] = new_temp;
6709 if (reduce_with_shift && !slp_reduc)
6711 int element_bitsize = tree_to_uhwi (bitsize);
6712 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6713 for variable-length vectors and also requires direct target support
6714 for loop reductions. */
6715 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6716 int nelements = vec_size_in_bits / element_bitsize;
6717 vec_perm_builder sel;
6718 vec_perm_indices indices;
6720 int elt_offset;
6722 tree zero_vec = build_zero_cst (vectype1);
6723 /* Case 2: Create:
6724 for (offset = nelements/2; offset >= 1; offset/=2)
6726 Create: va' = vec_shift <va, offset>
6727 Create: va = vop <va, va'>
6728 } */
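/* Illustration only (assuming a 4-element vector of partial sums
   {a0, a1, a2, a3} and a PLUS reduction): the first step shifts in
   zeros to form {a2, a3, 0, 0} and adds, giving {a0+a2, a1+a3, ..., ...};
   the next step shifts by one element and adds again, after which
   element 0 holds a0+a1+a2+a3 and is extracted below.  */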
6730 tree rhs;
6732 if (dump_enabled_p ())
6733 dump_printf_loc (MSG_NOTE, vect_location,
6734 "Reduce using vector shifts\n");
6736 gimple_seq stmts = NULL;
6737 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6738 for (elt_offset = nelements / 2;
6739 elt_offset >= 1;
6740 elt_offset /= 2)
6742 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6743 indices.new_vector (sel, 2, nelements);
6744 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6745 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6746 new_temp, zero_vec, mask);
6747 new_temp = gimple_build (&stmts, code,
6748 vectype1, new_name, new_temp);
6750 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6752 /* 2.4 Extract the final scalar result. Create:
6753 s_out3 = extract_field <v_out2, bitpos> */
6755 if (dump_enabled_p ())
6756 dump_printf_loc (MSG_NOTE, vect_location,
6757 "extract scalar result\n");
6759 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6760 bitsize, bitsize_zero_node);
6761 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6762 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6763 gimple_assign_set_lhs (epilog_stmt, new_temp);
6764 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6765 scalar_results.safe_push (new_temp);
6767 else
6769 /* Case 3: Create:
6770 s = extract_field <v_out2, 0>
6771 for (offset = element_size;
6772 offset < vector_size;
6773 offset += element_size;)
6775 Create: s' = extract_field <v_out2, offset>
6776 Create: s = op <s, s'> // For non SLP cases
6777 } */
6779 if (dump_enabled_p ())
6780 dump_printf_loc (MSG_NOTE, vect_location,
6781 "Reduce using scalar code.\n");
6783 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6784 int element_bitsize = tree_to_uhwi (bitsize);
6785 tree compute_type = TREE_TYPE (vectype);
6786 gimple_seq stmts = NULL;
6787 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6789 int bit_offset;
6790 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6791 vec_temp, bitsize, bitsize_zero_node);
6793 /* In SLP we don't need to apply the reduction operation, so we just
6794 collect s' values in SCALAR_RESULTS. */
6795 if (slp_reduc)
6796 scalar_results.safe_push (new_temp);
6798 for (bit_offset = element_bitsize;
6799 bit_offset < vec_size_in_bits;
6800 bit_offset += element_bitsize)
6802 tree bitpos = bitsize_int (bit_offset);
6803 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6804 compute_type, vec_temp,
6805 bitsize, bitpos);
6806 if (slp_reduc)
6808 /* In SLP we don't need to apply the reduction operation, so
6809 we just collect s' values in SCALAR_RESULTS. */
6810 new_temp = new_name;
6811 scalar_results.safe_push (new_name);
6813 else
6814 new_temp = gimple_build (&stmts, code, compute_type,
6815 new_name, new_temp);
6819 /* The only case where we need to reduce scalar results in SLP is
6820 unrolling. If the size of SCALAR_RESULTS is greater than
6821 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6822 REDUC_GROUP_SIZE. */
6823 if (slp_reduc)
6825 tree res, first_res, new_res;
6827 /* Reduce multiple scalar results in case of SLP unrolling. */
6828 for (j = group_size; scalar_results.iterate (j, &res);
6829 j++)
6831 first_res = scalar_results[j % group_size];
6832 new_res = gimple_build (&stmts, code, compute_type,
6833 first_res, res);
6834 scalar_results[j % group_size] = new_res;
6836 scalar_results.truncate (group_size);
6837 for (k = 0; k < group_size; k++)
6838 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6839 scalar_results[k]);
6841 else
6843 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6844 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6845 scalar_results.safe_push (new_temp);
6848 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6851 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6852 && induc_val)
6854 /* Earlier we set the initial value to be a vector of induc_val
6855 values. Check the result and if it is induc_val then replace
6856 with the original initial value, unless induc_val is
6857 the same as initial_def already. */
6858 tree zcompare = make_ssa_name (boolean_type_node);
6859 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6860 induc_val);
6861 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6862 tree initial_def = reduc_info->reduc_initial_values[0];
6863 tree tmp = make_ssa_name (new_scalar_dest);
6864 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6865 initial_def, new_temp);
6866 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6867 scalar_results[0] = tmp;
6871 /* 2.5 Adjust the final result by the initial value of the reduction
6872 variable. (When such adjustment is not needed, then
6873 'adjustment_def' is zero). For example, if code is PLUS we create:
6874 new_temp = loop_exit_def + adjustment_def */
6876 if (adjustment_def)
6878 gcc_assert (!slp_reduc);
6879 gimple_seq stmts = NULL;
6880 if (double_reduc)
6882 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6883 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6884 new_temp = gimple_build (&stmts, code, vectype,
6885 reduc_inputs[0], adjustment_def);
6887 else
6889 new_temp = scalar_results[0];
6890 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6891 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6892 adjustment_def);
6893 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6894 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6895 new_temp, adjustment_def);
6896 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6899 epilog_stmt = gimple_seq_last_stmt (stmts);
6900 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6901 scalar_results[0] = new_temp;
6904 /* Record this operation if it could be reused by the epilogue loop. */
6905 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6906 && reduc_inputs.length () == 1)
6907 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6908 { orig_reduc_input, reduc_info });
6910 if (double_reduc)
6911 loop = outer_loop;
6913 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6914 phis with new adjusted scalar results, i.e., replace use <s_out0>
6915 with use <s_out4>.
6917 Transform:
6918 loop_exit:
6919 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6920 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6921 v_out2 = reduce <v_out1>
6922 s_out3 = extract_field <v_out2, 0>
6923 s_out4 = adjust_result <s_out3>
6924 use <s_out0>
6925 use <s_out0>
6927 into:
6929 loop_exit:
6930 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6931 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6932 v_out2 = reduce <v_out1>
6933 s_out3 = extract_field <v_out2, 0>
6934 s_out4 = adjust_result <s_out3>
6935 use <s_out4>
6936 use <s_out4> */
6938 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6939 auto_vec<gimple *> phis;
6940 for (k = 0; k < live_out_stmts.size (); k++)
6942 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6943 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6945 /* Find the loop-closed-use at the loop exit of the original scalar
6946 result. (The reduction result is expected to have two immediate uses,
6947 one at the latch block, and one at the loop exit). For double
6948 reductions we are looking for exit phis of the outer loop. */
6949 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6951 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6953 if (!is_gimple_debug (USE_STMT (use_p)))
6954 phis.safe_push (USE_STMT (use_p));
6956 else
6958 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6960 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6962 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6964 if (!flow_bb_inside_loop_p (loop,
6965 gimple_bb (USE_STMT (phi_use_p)))
6966 && !is_gimple_debug (USE_STMT (phi_use_p)))
6967 phis.safe_push (USE_STMT (phi_use_p));
6973 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6975 /* Replace the uses: */
6976 orig_name = PHI_RESULT (exit_phi);
6978 /* Look for a single use at the target of the skip edge. */
6979 if (unify_with_main_loop_p)
6981 use_operand_p use_p;
6982 gimple *user;
6983 if (!single_imm_use (orig_name, &use_p, &user))
6984 gcc_unreachable ();
6985 orig_name = gimple_get_lhs (user);
6988 scalar_result = scalar_results[k];
6989 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6991 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6992 SET_USE (use_p, scalar_result);
6993 update_stmt (use_stmt);
6997 phis.truncate (0);
7001 /* Return a vector of type VECTYPE that is equal to the vector select
7002 operation "MASK ? VEC : IDENTITY". Insert the select statements
7003 before GSI. */
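/* For example (illustration only), with MASK = {1, 0, 1, 0},
   VEC = {v0, v1, v2, v3} and IDENTITY = {i0, i1, i2, i3} the returned
   SSA name holds {v0, i1, v2, i3}.  */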
7005 static tree
7006 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
7007 tree vec, tree identity)
7009 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7010 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7011 mask, vec, identity);
7012 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7013 return cond;
7016 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7017 order, starting with LHS. Insert the extraction statements before GSI and
7018 associate the new scalar SSA names with variable SCALAR_DEST.
7019 If MASK is nonzero, mask the input and then operate on it unconditionally.
7020 Return the SSA name for the result. */
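/* As a rough sketch (not the generated IL), for a 4-element VECTOR_RHS
   V and starting value LHS this emits the strictly ordered chain

     t0 = LHS CODE V[0];  t1 = t0 CODE V[1];
     t2 = t1 CODE V[2];   result = t2 CODE V[3];

   with each element extracted via a BIT_FIELD_REF.  */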
7022 static tree
7023 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7024 tree_code code, tree lhs, tree vector_rhs,
7025 tree mask)
7027 tree vectype = TREE_TYPE (vector_rhs);
7028 tree scalar_type = TREE_TYPE (vectype);
7029 tree bitsize = TYPE_SIZE (scalar_type);
7030 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7031 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7033 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7034 to perform an unconditional element-wise reduction of it. */
7035 if (mask)
7037 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7038 "masked_vector_rhs");
7039 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7040 false);
7041 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7042 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7043 mask, vector_rhs, vector_identity);
7044 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7045 vector_rhs = masked_vector_rhs;
7048 for (unsigned HOST_WIDE_INT bit_offset = 0;
7049 bit_offset < vec_size_in_bits;
7050 bit_offset += element_bitsize)
7052 tree bitpos = bitsize_int (bit_offset);
7053 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7054 bitsize, bitpos);
7056 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7057 rhs = make_ssa_name (scalar_dest, stmt);
7058 gimple_assign_set_lhs (stmt, rhs);
7059 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7061 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7062 tree new_name = make_ssa_name (scalar_dest, stmt);
7063 gimple_assign_set_lhs (stmt, new_name);
7064 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7065 lhs = new_name;
7067 return lhs;
7070 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7071 type of the vector input. */
7073 static internal_fn
7074 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7076 internal_fn mask_reduc_fn;
7077 internal_fn mask_len_reduc_fn;
7079 switch (reduc_fn)
7081 case IFN_FOLD_LEFT_PLUS:
7082 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7083 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7084 break;
7086 default:
7087 return IFN_LAST;
7090 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7091 OPTIMIZE_FOR_SPEED))
7092 return mask_reduc_fn;
7093 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7094 OPTIMIZE_FOR_SPEED))
7095 return mask_len_reduc_fn;
7096 return IFN_LAST;
7099 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7100 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7101 statement. CODE is the operation performed by STMT_INFO and OPS are
7102 its scalar operands. REDUC_INDEX is the index of the operand in
7103 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7104 implements in-order reduction, or IFN_LAST if we should open-code it.
7105 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7106 that should be used to control the operation in a fully-masked loop. */
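/* In scalar terms, for a PLUS reduction over x[0..n-1] with initial
   value init this computes (illustration only, not the generated IL)

     res = (((init + x[0]) + x[1]) + ...) + x[n-1];

   i.e. the association order of the original scalar loop is preserved,
   which is what matters for floating-point semantics.  */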
7108 static bool
7109 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7110 stmt_vec_info stmt_info,
7111 gimple_stmt_iterator *gsi,
7112 gimple **vec_stmt, slp_tree slp_node,
7113 gimple *reduc_def_stmt,
7114 code_helper code, internal_fn reduc_fn,
7115 tree *ops, int num_ops, tree vectype_in,
7116 int reduc_index, vec_loop_masks *masks,
7117 vec_loop_lens *lens)
7119 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7120 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7121 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7123 int ncopies;
7124 if (slp_node)
7125 ncopies = 1;
7126 else
7127 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7129 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7130 gcc_assert (ncopies == 1);
7132 bool is_cond_op = false;
7133 if (!code.is_tree_code ())
7135 code = conditional_internal_fn_code (internal_fn (code));
7136 gcc_assert (code != ERROR_MARK);
7137 is_cond_op = true;
7140 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7142 if (slp_node)
7144 if (is_cond_op)
7146 if (dump_enabled_p ())
7147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7148 "fold-left reduction on SLP not supported.\n");
7149 return false;
7152 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7153 TYPE_VECTOR_SUBPARTS (vectype_in)));
7156 /* The operands either come from a binary operation or an IFN_COND operation.
7157 The former is a gimple assign with binary rhs and the latter is a
7158 gimple call with four arguments. */
7159 gcc_assert (num_ops == 2 || num_ops == 4);
7160 tree op0, opmask;
7161 if (!is_cond_op)
7162 op0 = ops[1 - reduc_index];
7163 else
7165 op0 = ops[2 + (1 - reduc_index)];
7166 opmask = ops[0];
7167 gcc_assert (!slp_node);
7170 int group_size = 1;
7171 stmt_vec_info scalar_dest_def_info;
7172 auto_vec<tree> vec_oprnds0, vec_opmask;
7173 if (slp_node)
7175 auto_vec<vec<tree> > vec_defs (2);
7176 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7177 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7178 vec_defs[0].release ();
7179 vec_defs[1].release ();
7180 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7181 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7183 else
7185 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7186 op0, &vec_oprnds0);
7187 scalar_dest_def_info = stmt_info;
7189 /* For an IFN_COND_OP we also need the vector mask operand. */
7190 if (is_cond_op)
7191 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7192 opmask, &vec_opmask);
7195 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7196 tree scalar_dest = gimple_get_lhs (sdef);
7197 tree scalar_type = TREE_TYPE (scalar_dest);
7198 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7200 int vec_num = vec_oprnds0.length ();
7201 gcc_assert (vec_num == 1 || slp_node);
7202 tree vec_elem_type = TREE_TYPE (vectype_out);
7203 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7205 tree vector_identity = NULL_TREE;
7206 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7208 vector_identity = build_zero_cst (vectype_out);
7209 if (!HONOR_SIGNED_ZEROS (vectype_out))
7211 else
7213 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7214 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7215 vector_identity);
7219 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7220 int i;
7221 tree def0;
7222 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7224 gimple *new_stmt;
7225 tree mask = NULL_TREE;
7226 tree len = NULL_TREE;
7227 tree bias = NULL_TREE;
7228 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7229 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7230 else if (is_cond_op)
7231 mask = vec_opmask[0];
7232 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7234 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7235 i, 1);
7236 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7237 bias = build_int_cst (intQI_type_node, biasval);
7238 if (!is_cond_op)
7239 mask = build_minus_one_cst (truth_type_for (vectype_in));
7242 /* Handle MINUS by adding the negative. */
7243 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7245 tree negated = make_ssa_name (vectype_out);
7246 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7247 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7248 def0 = negated;
7251 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7252 && mask && mask_reduc_fn == IFN_LAST)
7253 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7254 vector_identity);
7256 /* On the first iteration the input is simply the scalar phi
7257 result, and for subsequent iterations it is the output of
7258 the preceding operation. */
7259 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7261 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7262 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7263 def0, mask, len, bias);
7264 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7265 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7266 def0, mask);
7267 else
7268 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7269 def0);
7270 /* For chained SLP reductions the output of the previous reduction
7271 operation serves as the input of the next. For the final statement
7272 the output cannot be a temporary - we reuse the original
7273 scalar destination of the last statement. */
7274 if (i != vec_num - 1)
7276 gimple_set_lhs (new_stmt, scalar_dest_var);
7277 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7278 gimple_set_lhs (new_stmt, reduc_var);
7281 else
7283 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7284 tree_code (code), reduc_var, def0,
7285 mask);
7286 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7287 /* Remove the statement, so that we can use the same code paths
7288 as for statements that we've just created. */
7289 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7290 gsi_remove (&tmp_gsi, true);
7293 if (i == vec_num - 1)
7295 gimple_set_lhs (new_stmt, scalar_dest);
7296 vect_finish_replace_stmt (loop_vinfo,
7297 scalar_dest_def_info,
7298 new_stmt);
7300 else
7301 vect_finish_stmt_generation (loop_vinfo,
7302 scalar_dest_def_info,
7303 new_stmt, gsi);
7305 if (slp_node)
7306 slp_node->push_vec_def (new_stmt);
7307 else
7309 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7310 *vec_stmt = new_stmt;
7314 return true;
7317 /* Function is_nonwrapping_integer_induction.
7319 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7320 does not cause overflow. */
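/* Roughly: with base B, step S and at most NI iterations of LOOP, the
   largest value the induction reaches is B + S * NI; the check below
   requires that this value, computed in infinite precision, still fits
   in the precision of the PHI result type (unless overflow is already
   undefined for that type, in which case we may assume it).  */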
7322 static bool
7323 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7325 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7326 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7327 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7328 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7329 widest_int ni, max_loop_value, lhs_max;
7330 wi::overflow_type overflow = wi::OVF_NONE;
7332 /* Make sure the loop is integer based. */
7333 if (TREE_CODE (base) != INTEGER_CST
7334 || TREE_CODE (step) != INTEGER_CST)
7335 return false;
7337 /* Check that the max size of the loop will not wrap. */
7339 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7340 return true;
7342 if (! max_stmt_executions (loop, &ni))
7343 return false;
7345 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7346 &overflow);
7347 if (overflow)
7348 return false;
7350 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7351 TYPE_SIGN (lhs_type), &overflow);
7352 if (overflow)
7353 return false;
7355 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7356 <= TYPE_PRECISION (lhs_type));
7359 /* Check if masking can be supported by inserting a conditional expression.
7360 CODE is the code for the operation. COND_FN is the conditional internal
7361 function, if it exists. VECTYPE_IN is the type of the vector input. */
7362 static bool
7363 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7364 tree vectype_in)
7366 if (cond_fn != IFN_LAST
7367 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7368 OPTIMIZE_FOR_SPEED))
7369 return false;
7371 if (code.is_tree_code ())
7372 switch (tree_code (code))
7374 case DOT_PROD_EXPR:
7375 case SAD_EXPR:
7376 return true;
7378 default:
7379 break;
7381 return false;
7384 /* Insert a conditional expression to enable masked vectorization. CODE is the
7385 code for the operation. VOP is the array of operands. MASK is the loop
7386 mask. GSI is a statement iterator used to place the new conditional
7387 expression. */
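/* Sketch of the idea (illustration only): for DOT_PROD_EXPR the second
   operand becomes MASK ? vop[1] : 0, so inactive lanes multiply by zero
   and add nothing to the accumulator; for SAD_EXPR it becomes
   MASK ? vop[1] : vop[0], so inactive lanes contribute
   |vop[0] - vop[0]| = 0.  */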
7388 static void
7389 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7390 gimple_stmt_iterator *gsi)
7392 switch (tree_code (code))
7394 case DOT_PROD_EXPR:
7396 tree vectype = TREE_TYPE (vop[1]);
7397 tree zero = build_zero_cst (vectype);
7398 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7399 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7400 mask, vop[1], zero);
7401 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7402 vop[1] = masked_op1;
7403 break;
7406 case SAD_EXPR:
7408 tree vectype = TREE_TYPE (vop[1]);
7409 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7410 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7411 mask, vop[1], vop[0]);
7412 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7413 vop[1] = masked_op1;
7414 break;
7417 default:
7418 gcc_unreachable ();
7422 /* Function vectorizable_reduction.
7424 Check if STMT_INFO performs a reduction operation that can be vectorized.
7425 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7426 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7427 Return true if STMT_INFO is vectorizable in this way.
7429 This function also handles reduction idioms (patterns) that have been
7430 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7431 may be of this form:
7432 X = pattern_expr (arg0, arg1, ..., X)
7433 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7434 sequence that had been detected and replaced by the pattern-stmt
7435 (STMT_INFO).
7437 This function also handles reduction of condition expressions, for example:
7438 for (int i = 0; i < N; i++)
7439 if (a[i] < value)
7440 last = a[i];
7441 This is handled by vectorizing the loop and creating an additional vector
7442 containing the loop indexes for which "a[i] < value" was true. In the
7443 function epilogue this is reduced to a single max value and then used to
7444 index into the vector of results.
7446 In some cases of reduction patterns, the type of the reduction variable X is
7447 different than the type of the other arguments of STMT_INFO.
7448 In such cases, the vectype that is used when transforming STMT_INFO into
7449 a vector stmt is different than the vectype that is used to determine the
7450 vectorization factor, because it consists of a different number of elements
7451 than the actual number of elements that are being operated upon in parallel.
7453 For example, consider an accumulation of shorts into an int accumulator.
7454 On some targets it's possible to vectorize this pattern operating on 8
7455 shorts at a time (hence, the vectype for purposes of determining the
7456 vectorization factor should be V8HI); on the other hand, the vectype that
7457 is used to create the vector form is actually V4SI (the type of the result).
7459 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7460 indicates what is the actual level of parallelism (V8HI in the example), so
7461 that the right vectorization factor would be derived. This vectype
7462 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7463 be used to create the vectorized stmt. The right vectype for the vectorized
7464 stmt is obtained from the type of the result X:
7465 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7467 This means that, contrary to "regular" reductions (or "regular" stmts in
7468 general), the following equation:
7469 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7470 does *NOT* necessarily hold for reduction patterns. */
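/* A sketch of the shorts-into-int accumulation mentioned above
   (illustration only):

     short s[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += (int) s[i];

   on a target with 128-bit vectors the loop consumes 8 shorts per
   vector iteration (so V8HI determines the VF) while the vectorized
   statement itself produces V4SI partial sums.  */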
7472 bool
7473 vectorizable_reduction (loop_vec_info loop_vinfo,
7474 stmt_vec_info stmt_info, slp_tree slp_node,
7475 slp_instance slp_node_instance,
7476 stmt_vector_for_cost *cost_vec)
7478 tree vectype_in = NULL_TREE;
7479 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7480 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7481 stmt_vec_info cond_stmt_vinfo = NULL;
7482 int i;
7483 int ncopies;
7484 bool single_defuse_cycle = false;
7485 bool nested_cycle = false;
7486 bool double_reduc = false;
7487 int vec_num;
7488 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7489 tree cond_reduc_val = NULL_TREE;
7491 /* Make sure it was already recognized as a reduction computation. */
7492 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7493 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7494 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7495 return false;
7497 /* The stmt we store reduction analysis meta on. */
7498 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7499 reduc_info->is_reduc_info = true;
7501 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7503 if (is_a <gphi *> (stmt_info->stmt))
7505 if (slp_node)
7507 /* We eventually need to set a vector type on invariant
7508 arguments. */
7509 unsigned j;
7510 slp_tree child;
7511 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7512 if (!vect_maybe_update_slp_op_vectype
7513 (child, SLP_TREE_VECTYPE (slp_node)))
7515 if (dump_enabled_p ())
7516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7517 "incompatible vector types for "
7518 "invariants\n");
7519 return false;
7522 /* Analysis for double-reduction is done on the outer
7523 loop PHI, nested cycles have no further restrictions. */
7524 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7526 else
7527 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7528 return true;
7531 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7532 stmt_vec_info phi_info = stmt_info;
7533 if (!is_a <gphi *> (stmt_info->stmt))
7535 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7536 return true;
7538 if (slp_node)
7540 slp_node_instance->reduc_phis = slp_node;
7541 /* ??? We're leaving slp_node to point to the PHIs; we only
7542 need it to get at the number of vector stmts, which wasn't
7543 yet initialized for the instance root. */
7545 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7547 use_operand_p use_p;
7548 gimple *use_stmt;
7549 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7550 &use_p, &use_stmt);
7551 gcc_assert (res);
7552 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7555 /* PHIs should not participate in patterns. */
7556 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7557 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7559 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
7560 and compute the reduction chain length. Discover the real
7561 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7562 tree reduc_def
7563 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7564 loop_latch_edge
7565 (gimple_bb (reduc_def_phi)->loop_father));
7566 unsigned reduc_chain_length = 0;
7567 bool only_slp_reduc_chain = true;
7568 stmt_info = NULL;
7569 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7570 while (reduc_def != PHI_RESULT (reduc_def_phi))
7572 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7573 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7574 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7576 if (dump_enabled_p ())
7577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7578 "reduction chain broken by patterns.\n");
7579 return false;
7581 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7582 only_slp_reduc_chain = false;
7583 /* For epilogue generation live members of the chain need
7584 to point back to the PHI via their original stmt for
7585 info_for_reduction to work. For SLP we need to look at
7586 all lanes here - even though we will only vectorize from
7587 the SLP node with live lane zero, the other live lanes also
7588 need to be identified as part of a reduction to be able
7589 to skip code generation for them. */
7590 if (slp_for_stmt_info)
7592 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7593 if (STMT_VINFO_LIVE_P (s))
7594 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7596 else if (STMT_VINFO_LIVE_P (vdef))
7597 STMT_VINFO_REDUC_DEF (def) = phi_info;
7598 gimple_match_op op;
7599 if (!gimple_extract_op (vdef->stmt, &op))
7601 if (dump_enabled_p ())
7602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7603 "reduction chain includes unsupported"
7604 " statement type.\n");
7605 return false;
7607 if (CONVERT_EXPR_CODE_P (op.code))
7609 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7611 if (dump_enabled_p ())
7612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7613 "conversion in the reduction chain.\n");
7614 return false;
7617 else if (!stmt_info)
7618 /* First non-conversion stmt. */
7619 stmt_info = vdef;
7620 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7621 reduc_chain_length++;
7622 if (!stmt_info && slp_node)
7623 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7625 /* PHIs should not participate in patterns. */
7626 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7628 if (nested_in_vect_loop_p (loop, stmt_info))
7630 loop = loop->inner;
7631 nested_cycle = true;
7634 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7635 element. */
7636 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7638 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7639 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7641 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7642 gcc_assert (slp_node
7643 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7645 /* 1. Is vectorizable reduction? */
7646 /* Not supportable if the reduction variable is used in the loop, unless
7647 it's a reduction chain. */
7648 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7649 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7650 return false;
7652 /* Reductions that are not used even in an enclosing outer-loop
7653 are expected to be "live" (used out of the loop). */
7654 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7655 && !STMT_VINFO_LIVE_P (stmt_info))
7656 return false;
7658 /* 2. Has this been recognized as a reduction pattern?
7660 Check if STMT represents a pattern that has been recognized
7661 in earlier analysis stages. For stmts that represent a pattern,
7662 the STMT_VINFO_RELATED_STMT field records the last stmt in
7663 the original sequence that constitutes the pattern. */
7665 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7666 if (orig_stmt_info)
7668 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7669 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7672 /* 3. Check the operands of the operation. The first operands are defined
7673 inside the loop body. The last operand is the reduction variable,
7674 which is defined by the loop-header-phi. */
7676 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7677 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7678 gimple_match_op op;
7679 if (!gimple_extract_op (stmt_info->stmt, &op))
7680 gcc_unreachable ();
7681 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7682 || op.code == WIDEN_SUM_EXPR
7683 || op.code == SAD_EXPR);
7685 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7686 && !SCALAR_FLOAT_TYPE_P (op.type))
7687 return false;
7689 /* Do not try to vectorize bit-precision reductions. */
7690 if (!type_has_mode_precision_p (op.type))
7691 return false;
7693 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7694 which means the only use of that PHI may be in the lane-reducing operation. */
7695 if (lane_reduc_code_p
7696 && reduc_chain_length != 1
7697 && !only_slp_reduc_chain)
7699 if (dump_enabled_p ())
7700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7701 "lane-reducing reduction with extra stmts.\n");
7702 return false;
7705 /* All uses but the last are expected to be defined in the loop.
7706 The last use is the reduction variable. In case of nested cycle this
7707 assumption is not true: we use reduc_index to record the index of the
7708 reduction variable. */
7709 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7710 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7711 /* We need to skip an extra operand for COND_EXPRs with embedded
7712 comparison. */
7713 unsigned opno_adjust = 0;
7714 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7715 opno_adjust = 1;
7716 for (i = 0; i < (int) op.num_ops; i++)
7718 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7719 if (i == 0 && op.code == COND_EXPR)
7720 continue;
7722 stmt_vec_info def_stmt_info;
7723 enum vect_def_type dt;
7724 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7725 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7726 &vectype_op[i], &def_stmt_info))
7728 if (dump_enabled_p ())
7729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7730 "use not simple.\n");
7731 return false;
7733 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7734 continue;
7736 /* For an IFN_COND_OP we might hit the reduction definition operand
7737 twice (once as definition, once as else). */
7738 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7739 continue;
7741 /* There should be only one cycle def in the stmt, the one
7742 leading to reduc_def. */
7743 if (VECTORIZABLE_CYCLE_DEF (dt))
7744 return false;
7746 if (!vectype_op[i])
7747 vectype_op[i]
7748 = get_vectype_for_scalar_type (loop_vinfo,
7749 TREE_TYPE (op.ops[i]), slp_op[i]);
7751 /* To properly compute ncopies we are interested in the widest
7752 non-reduction input type in case we're looking at a widening
7753 accumulation that we later handle in vect_transform_reduction. */
7754 if (lane_reduc_code_p
7755 && vectype_op[i]
7756 && (!vectype_in
7757 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7758 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7759 vectype_in = vectype_op[i];
7761 if (op.code == COND_EXPR)
7763 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7764 if (dt == vect_constant_def)
7766 cond_reduc_dt = dt;
7767 cond_reduc_val = op.ops[i];
7769 if (dt == vect_induction_def
7770 && def_stmt_info
7771 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7773 cond_reduc_dt = dt;
7774 cond_stmt_vinfo = def_stmt_info;
7778 if (!vectype_in)
7779 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7780 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7782 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7783 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7784 /* If we have a condition reduction, see if we can simplify it further. */
7785 if (v_reduc_type == COND_REDUCTION)
7787 if (slp_node)
7788 return false;
7790 /* When the reduction value is used in the condition of the COND_EXPR, fail. */
7791 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7793 if (dump_enabled_p ())
7794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7795 "condition depends on previous iteration\n");
7796 return false;
7799 if (reduc_chain_length == 1
7800 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7801 OPTIMIZE_FOR_SPEED)
7802 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7803 vectype_in,
7804 OPTIMIZE_FOR_SPEED)))
7806 if (dump_enabled_p ())
7807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7808 "optimizing condition reduction with"
7809 " FOLD_EXTRACT_LAST.\n");
7810 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7812 else if (cond_reduc_dt == vect_induction_def)
7814 tree base
7815 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7816 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7818 gcc_assert (TREE_CODE (base) == INTEGER_CST
7819 && TREE_CODE (step) == INTEGER_CST);
7820 cond_reduc_val = NULL_TREE;
7821 enum tree_code cond_reduc_op_code = ERROR_MARK;
7822 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7823 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7825 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7826 above base; punt if base is the minimum value of the type for
7827 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7828 else if (tree_int_cst_sgn (step) == -1)
7830 cond_reduc_op_code = MIN_EXPR;
7831 if (tree_int_cst_sgn (base) == -1)
7832 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7833 else if (tree_int_cst_lt (base,
7834 TYPE_MAX_VALUE (TREE_TYPE (base))))
7835 cond_reduc_val
7836 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7838 else
7840 cond_reduc_op_code = MAX_EXPR;
7841 if (tree_int_cst_sgn (base) == 1)
7842 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7843 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7844 base))
7845 cond_reduc_val
7846 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7848 if (cond_reduc_val)
7850 if (dump_enabled_p ())
7851 dump_printf_loc (MSG_NOTE, vect_location,
7852 "condition expression based on "
7853 "integer induction.\n");
7854 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7855 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7856 = cond_reduc_val;
7857 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7860 else if (cond_reduc_dt == vect_constant_def)
7862 enum vect_def_type cond_initial_dt;
7863 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7864 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7865 if (cond_initial_dt == vect_constant_def
7866 && types_compatible_p (TREE_TYPE (cond_initial_val),
7867 TREE_TYPE (cond_reduc_val)))
7869 tree e = fold_binary (LE_EXPR, boolean_type_node,
7870 cond_initial_val, cond_reduc_val);
7871 if (e && (integer_onep (e) || integer_zerop (e)))
7873 if (dump_enabled_p ())
7874 dump_printf_loc (MSG_NOTE, vect_location,
7875 "condition expression based on "
7876 "compile time constant.\n");
7877 /* Record reduction code at analysis stage. */
7878 STMT_VINFO_REDUC_CODE (reduc_info)
7879 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7880 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7886 if (STMT_VINFO_LIVE_P (phi_info))
7887 return false;
7889 if (slp_node)
7890 ncopies = 1;
7891 else
7892 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7894 gcc_assert (ncopies >= 1);
7896 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7898 if (nested_cycle)
7900 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7901 == vect_double_reduction_def);
7902 double_reduc = true;
7905 /* 4.2. Check support for the epilog operation.
7907 If STMT represents a reduction pattern, then the type of the
7908 reduction variable may be different than the type of the rest
7909 of the arguments. For example, consider the case of accumulation
7910 of shorts into an int accumulator; The original code:
7911 S1: int_a = (int) short_a;
7912 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7914 was replaced with:
7915 STMT: int_acc = widen_sum <short_a, int_acc>
7917 This means that:
7918 1. The tree-code that is used to create the vector operation in the
7919 epilog code (that reduces the partial results) is not the
7920 tree-code of STMT, but is rather the tree-code of the original
7921 stmt from the pattern that STMT is replacing. I.e, in the example
7922 above we want to use 'widen_sum' in the loop, but 'plus' in the
7923 epilog.
7924 2. The type (mode) we use to check available target support
7925 for the vector operation to be created in the *epilog*, is
7926 determined by the type of the reduction variable (in the example
7927 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7928 However the type (mode) we use to check available target support
7929 for the vector operation to be created *inside the loop*, is
7930 determined by the type of the other arguments to STMT (in the
7931 example we'd check this: optab_handler (widen_sum_optab,
7932 vect_short_mode)).
7934 This is contrary to "regular" reductions, in which the types of all
7935 the arguments are the same as the type of the reduction variable.
7936 For "regular" reductions we can therefore use the same vector type
7937 (and also the same tree-code) when generating the epilog code and
7938 when generating the code inside the loop. */
7940 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7942 /* Conversion might have created a conditional operation like
7943 IFN_COND_ADD already; if so, use the code it wraps for the following checks. */
7944 if (orig_code.is_internal_fn ())
7946 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7947 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7950 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7952 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7953 if (reduction_type == TREE_CODE_REDUCTION)
7955 /* Check whether it's ok to change the order of the computation.
7956 Generally, when vectorizing a reduction we change the order of the
7957 computation. This may change the behavior of the program in some
7958 cases, so we need to check that this is ok. One exception is when
7959 vectorizing an outer-loop: the inner-loop is executed sequentially,
7960 and therefore vectorizing reductions in the inner-loop during
7961 outer-loop vectorization is safe. Likewise when we are vectorizing
7962 a series of reductions using SLP and the VF is one the reductions
7963 are performed in scalar order. */
7964 if (slp_node
7965 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7966 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7968 else if (needs_fold_left_reduction_p (op.type, orig_code))
7970 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7971 is not directly used in the stmt. */
7972 if (!only_slp_reduc_chain
7973 && reduc_chain_length != 1)
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977 "in-order reduction chain without SLP.\n");
7978 return false;
7980 STMT_VINFO_REDUC_TYPE (reduc_info)
7981 = reduction_type = FOLD_LEFT_REDUCTION;
7983 else if (!commutative_binary_op_p (orig_code, op.type)
7984 || !associative_binary_op_p (orig_code, op.type))
7986 if (dump_enabled_p ())
7987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7988 "reduction: not commutative/associative\n");
7989 return false;
7993 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7994 && ncopies > 1)
7996 if (dump_enabled_p ())
7997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7998 "multiple types in double reduction or condition "
7999 "reduction or fold-left reduction.\n");
8000 return false;
8003 internal_fn reduc_fn = IFN_LAST;
8004 if (reduction_type == TREE_CODE_REDUCTION
8005 || reduction_type == FOLD_LEFT_REDUCTION
8006 || reduction_type == INTEGER_INDUC_COND_REDUCTION
8007 || reduction_type == CONST_COND_REDUCTION)
8009 if (reduction_type == FOLD_LEFT_REDUCTION
8010 ? fold_left_reduction_fn (orig_code, &reduc_fn)
8011 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8013 if (reduc_fn != IFN_LAST
8014 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8015 OPTIMIZE_FOR_SPEED))
8017 if (dump_enabled_p ())
8018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8019 "reduc op not supported by target.\n");
8021 reduc_fn = IFN_LAST;
8024 else
8026 if (!nested_cycle || double_reduc)
8028 if (dump_enabled_p ())
8029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8030 "no reduc code for scalar code.\n");
8032 return false;
8036 else if (reduction_type == COND_REDUCTION)
8038 int scalar_precision
8039 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8040 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8041 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8042 vectype_out);
8044 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8045 OPTIMIZE_FOR_SPEED))
8046 reduc_fn = IFN_REDUC_MAX;
8048 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8050 if (reduction_type != EXTRACT_LAST_REDUCTION
8051 && (!nested_cycle || double_reduc)
8052 && reduc_fn == IFN_LAST
8053 && !nunits_out.is_constant ())
8055 if (dump_enabled_p ())
8056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8057 "missing target support for reduction on"
8058 " variable-length vectors.\n");
8059 return false;
8062 /* For SLP reductions, see if there is a neutral value we can use. */
8063 tree neutral_op = NULL_TREE;
8064 if (slp_node)
8066 tree initial_value = NULL_TREE;
8067 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8068 initial_value = vect_phi_initial_value (reduc_def_phi);
8069 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8070 orig_code, initial_value);
8073 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8075 /* We can't support in-order reductions of code such as this:
8077 for (int i = 0; i < n1; ++i)
8078 for (int j = 0; j < n2; ++j)
8079 l += a[j];
8081 since GCC effectively transforms the loop when vectorizing:
8083 for (int i = 0; i < n1 / VF; ++i)
8084 for (int j = 0; j < n2; ++j)
8085 for (int k = 0; k < VF; ++k)
8086 l += a[j];
8088 which is a reassociation of the original operation. */
8089 if (dump_enabled_p ())
8090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8091 "in-order double reduction not supported.\n");
8093 return false;
8096 if (reduction_type == FOLD_LEFT_REDUCTION
8097 && slp_node
8098 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8100 /* We cannot use in-order reductions in this case because there is
8101 an implicit reassociation of the operations involved. */
8102 if (dump_enabled_p ())
8103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8104 "in-order unchained SLP reductions not supported.\n");
8105 return false;
8108 /* For double reductions, and for SLP reductions with a neutral value,
8109 we construct a variable-length initial vector by loading a vector
8110 full of the neutral value and then shift-and-inserting the start
8111 values into the low-numbered elements. */
8112 if ((double_reduc || neutral_op)
8113 && !nunits_out.is_constant ()
8114 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8115 vectype_out, OPTIMIZE_FOR_SPEED))
8117 if (dump_enabled_p ())
8118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8119 "reduction on variable-length vectors requires"
8120 " target support for a vector-shift-and-insert"
8121 " operation.\n");
8122 return false;
8125 /* Check extra constraints for variable-length unchained SLP reductions. */
8126 if (slp_node
8127 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8128 && !nunits_out.is_constant ())
8130 /* We checked above that we could build the initial vector when
8131 there's a neutral element value. Check here for the case in
8132 which each SLP statement has its own initial value and in which
8133 that value needs to be repeated for every instance of the
8134 statement within the initial vector. */
8135 unsigned int group_size = SLP_TREE_LANES (slp_node);
8136 if (!neutral_op
8137 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8138 TREE_TYPE (vectype_out)))
8140 if (dump_enabled_p ())
8141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8142 "unsupported form of SLP reduction for"
8143 " variable-length vectors: cannot build"
8144 " initial vector.\n");
8145 return false;
8147 /* The epilogue code relies on the number of elements being a multiple
8148 of the group size. The duplicate-and-interleave approach to setting
8149 up the initial vector does too. */
8150 if (!multiple_p (nunits_out, group_size))
8152 if (dump_enabled_p ())
8153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8154 "unsupported form of SLP reduction for"
8155 " variable-length vectors: the vector size"
8156 " is not a multiple of the number of results.\n");
8157 return false;
8161 if (reduction_type == COND_REDUCTION)
8163 widest_int ni;
8165 if (! max_loop_iterations (loop, &ni))
8167 if (dump_enabled_p ())
8168 dump_printf_loc (MSG_NOTE, vect_location,
8169 "loop count not known, cannot create cond "
8170 "reduction.\n");
8171 return false;
8173 /* Convert backedges to iterations. */
8174 ni += 1;
8176 /* The additional index will be the same type as the condition. Check
8177 that the loop count fits into this type less one (because we'll use up
8178 the zero slot for when there are no matches). */
8179 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8180 if (wi::geu_p (ni, wi::to_widest (max_index)))
8182 if (dump_enabled_p ())
8183 dump_printf_loc (MSG_NOTE, vect_location,
8184 "loop size is greater than data size.\n");
8185 return false;
8189 /* In case the vectorization factor (VF) is bigger than the number
8190 of elements that we can fit in a vectype (nunits), we have to generate
8191 more than one vector stmt - i.e., we need to "unroll" the
8192 vector stmt by a factor VF/nunits. For more details see documentation
8193 in vectorizable_operation. */
8195 /* If the reduction is used in an outer loop we need to generate
8196 VF intermediate results, like so (e.g. for ncopies=2):
8197 r0 = phi (init, r0)
8198 r1 = phi (init, r1)
8199 r0 = x0 + r0;
8200 r1 = x1 + r1;
8201 (i.e. we generate VF results in 2 registers).
8202 In this case we have a separate def-use cycle for each copy, and therefore
8203 for each copy we get the vector def for the reduction variable from the
8204 respective phi node created for this copy.
8206 Otherwise (the reduction is unused in the loop nest), we can combine
8207 together intermediate results, like so (e.g. for ncopies=2):
8208 r = phi (init, r)
8209 r = x0 + r;
8210 r = x1 + r;
8211 (i.e. we generate VF/2 results in a single register).
8212 In this case for each copy we get the vector def for the reduction variable
8213 from the vectorized reduction operation generated in the previous iteration.
8215 This only works when we see both the reduction PHI and its only consumer
8216 in vectorizable_reduction and there are no intermediate stmts
8217 participating. When unrolling we want each unrolled iteration to have its
8218 own reduction accumulator since one of the main goals of unrolling a
8219 reduction is to reduce the aggregate loop-carried latency. */
8220 if (ncopies > 1
8221 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8222 && reduc_chain_length == 1
8223 && loop_vinfo->suggested_unroll_factor == 1)
8224 single_defuse_cycle = true;
8226 if (single_defuse_cycle || lane_reduc_code_p)
8228 gcc_assert (op.code != COND_EXPR);
8230 /* 4. Supportable by target? */
8231 bool ok = true;
8233 /* 4.1. check support for the operation in the loop
8235 This isn't necessary for the lane reduction codes, since they
8236 can only be produced by pattern matching, and it's up to the
8237 pattern matcher to test for support. The main reason for
8238 specifically skipping this step is to avoid rechecking whether
8239 mixed-sign dot-products can be implemented using signed
8240 dot-products. */
8241 machine_mode vec_mode = TYPE_MODE (vectype_in);
8242 if (!lane_reduc_code_p
8243 && !directly_supported_p (op.code, vectype_in, optab_vector))
8245 if (dump_enabled_p ())
8246 dump_printf (MSG_NOTE, "op not supported by target.\n");
8247 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8248 || !vect_can_vectorize_without_simd_p (op.code))
8249 ok = false;
8250 else
8251 if (dump_enabled_p ())
8252 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8255 if (vect_emulated_vector_p (vectype_in)
8256 && !vect_can_vectorize_without_simd_p (op.code))
8258 if (dump_enabled_p ())
8259 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8260 return false;
8263 /* lane-reducing operations have to go through vect_transform_reduction.
8264 For the other cases try without the single cycle optimization. */
8265 if (!ok)
8267 if (lane_reduc_code_p)
8268 return false;
8269 else
8270 single_defuse_cycle = false;
8273 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8275 /* If the reduction stmt is one of the patterns that have lane
8276 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8277 if ((ncopies > 1 && ! single_defuse_cycle)
8278 && lane_reduc_code_p)
8280 if (dump_enabled_p ())
8281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8282 "multi def-use cycle not possible for lane-reducing "
8283 "reduction operation\n");
8284 return false;
8287 if (slp_node
8288 && !(!single_defuse_cycle
8289 && !lane_reduc_code_p
8290 && reduction_type != FOLD_LEFT_REDUCTION))
8291 for (i = 0; i < (int) op.num_ops; i++)
8292 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8294 if (dump_enabled_p ())
8295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8296 "incompatible vector types for invariants\n");
8297 return false;
8300 if (slp_node)
8301 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8302 else
8303 vec_num = 1;
8305 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8306 reduction_type, ncopies, cost_vec);
8307 /* Cost the reduction op inside the loop if transformed via
8308 vect_transform_reduction. Otherwise this is costed by the
8309 separate vectorizable_* routines. */
8310 if (single_defuse_cycle || lane_reduc_code_p)
8312 int factor = 1;
8313 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8314 /* Three dot-products and a subtraction. */
8315 factor = 4;
8316 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8317 stmt_info, 0, vect_body);
8320 if (dump_enabled_p ()
8321 && reduction_type == FOLD_LEFT_REDUCTION)
8322 dump_printf_loc (MSG_NOTE, vect_location,
8323 "using an in-order (fold-left) reduction.\n");
8324 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8325 /* All but single def-use-cycle optimized, lane-reducing and fold-left
8326 reductions go through their own vectorizable_* routines. */
8327 if (!single_defuse_cycle
8328 && !lane_reduc_code_p
8329 && reduction_type != FOLD_LEFT_REDUCTION)
8331 stmt_vec_info tem
8332 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8333 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8335 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8336 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8338 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8339 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8341 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8343 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8344 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8345 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8347 if (reduction_type != FOLD_LEFT_REDUCTION
8348 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8349 && (cond_fn == IFN_LAST
8350 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8351 OPTIMIZE_FOR_SPEED)))
8353 if (dump_enabled_p ())
8354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8355 "can't operate on partial vectors because"
8356 " no conditional operation is available.\n");
8357 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8359 else if (reduction_type == FOLD_LEFT_REDUCTION
8360 && reduc_fn == IFN_LAST
8361 && !expand_vec_cond_expr_p (vectype_in,
8362 truth_type_for (vectype_in),
8363 SSA_NAME))
8365 if (dump_enabled_p ())
8366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8367 "can't operate on partial vectors because"
8368 " no conditional operation is available.\n");
8369 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8371 else if (reduction_type == FOLD_LEFT_REDUCTION
8372 && internal_fn_mask_index (reduc_fn) == -1
8373 && FLOAT_TYPE_P (vectype_in)
8374 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8376 if (dump_enabled_p ())
8377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8378 "can't operate on partial vectors because"
8379 " signed zeros cannot be preserved.\n");
8380 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8382 else
8384 internal_fn mask_reduc_fn
8385 = get_masked_reduction_fn (reduc_fn, vectype_in);
8387 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8388 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8389 vectype_in, 1);
8390 else
8391 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8392 vectype_in, NULL);
8395 return true;
8398 /* STMT_INFO is a dot-product reduction whose multiplication operands
8399 have different signs. Emit a sequence to emulate the operation
8400 using a series of signed DOT_PROD_EXPRs and return the last
8401 statement generated. VEC_DEST is the result of the vector operation
8402 and VOP lists its inputs. */
8404 static gassign *
8405 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8406 gimple_stmt_iterator *gsi, tree vec_dest,
8407 tree vop[3])
8409 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8410 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8411 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8412 gimple *new_stmt;
8414 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8415 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8416 std::swap (vop[0], vop[1]);
8418 /* Convert all inputs to signed types. */
8419 for (int i = 0; i < 3; ++i)
8420 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8422 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8423 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8424 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8425 vop[i] = tmp;
8428 /* In the comments below we assume 8-bit inputs for simplicity,
8429 but the approach works for any full integer type. */
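/* (Illustrative: for 16-bit inputs the corresponding constants below would
   be -32768 and 16384.)  */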
8431 /* Create a vector of -128. */
8432 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8433 tree min_narrow = build_vector_from_val (narrow_vectype,
8434 min_narrow_elttype);
8436 /* Create a vector of 64. */
8437 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8438 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8439 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8441 /* Emit: SUB_RES = VOP[0] - 128. */
8442 tree sub_res = make_ssa_name (narrow_vectype);
8443 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8444 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8446 /* Emit:
8448 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8449 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8450 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8452 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8453 Doing the two 64 * y steps first allows more time to compute x. */
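/* A quick sanity check of the identity (illustrative): for unsigned x = 200
   and signed y = -3, x * y = -600, and
   (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600.  */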
8454 tree stage1 = make_ssa_name (wide_vectype);
8455 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8456 vop[1], half_narrow, vop[2]);
8457 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8459 tree stage2 = make_ssa_name (wide_vectype);
8460 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8461 vop[1], half_narrow, stage1);
8462 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8464 tree stage3 = make_ssa_name (wide_vectype);
8465 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8466 sub_res, vop[1], stage2);
8467 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8469 /* Convert STAGE3 to the reduction type. */
8470 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8473 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8474 value. */
8476 bool
8477 vect_transform_reduction (loop_vec_info loop_vinfo,
8478 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8479 gimple **vec_stmt, slp_tree slp_node)
8481 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8482 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8483 int i;
8484 int ncopies;
8485 int vec_num;
8487 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8488 gcc_assert (reduc_info->is_reduc_info);
8490 if (nested_in_vect_loop_p (loop, stmt_info))
8492 loop = loop->inner;
8493 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8496 gimple_match_op op;
8497 if (!gimple_extract_op (stmt_info->stmt, &op))
8498 gcc_unreachable ();
8500 /* All uses but the last are expected to be defined in the loop.
8501 The last use is the reduction variable. In case of nested cycle this
8502 assumption is not true: we use reduc_index to record the index of the
8503 reduction variable. */
8504 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8505 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8506 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8507 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8509 if (slp_node)
8511 ncopies = 1;
8512 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8514 else
8516 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8517 vec_num = 1;
8520 code_helper code = canonicalize_code (op.code, op.type);
8521 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8523 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8524 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8525 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8527 /* Transform. */
8528 tree new_temp = NULL_TREE;
8529 auto_vec<tree> vec_oprnds0;
8530 auto_vec<tree> vec_oprnds1;
8531 auto_vec<tree> vec_oprnds2;
8532 tree def0;
8534 if (dump_enabled_p ())
8535 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8537 /* FORNOW: Multiple types are not supported for condition. */
8538 if (code == COND_EXPR)
8539 gcc_assert (ncopies == 1);
8541 /* A binary COND_OP reduction must have the same definition and else
8542 value. */
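/* For example (illustrative), a masked sum reduction appears as
   r_1 = .COND_ADD (mask, r_0, x_1, r_0): the else value (last operand)
   is the reduction input r_0 itself, so inactive lanes simply pass the
   accumulator through.  */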
8543 bool cond_fn_p = code.is_internal_fn ()
8544 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8545 if (cond_fn_p)
8547 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8548 || code == IFN_COND_MUL || code == IFN_COND_AND
8549 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8550 gcc_assert (op.num_ops == 4
8551 && (op.ops[reduc_index]
8552 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8555 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8557 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8558 if (reduction_type == FOLD_LEFT_REDUCTION)
8560 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8561 gcc_assert (code.is_tree_code () || cond_fn_p);
8562 return vectorize_fold_left_reduction
8563 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8564 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8565 reduc_index, masks, lens);
8568 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8569 gcc_assert (single_defuse_cycle
8570 || code == DOT_PROD_EXPR
8571 || code == WIDEN_SUM_EXPR
8572 || code == SAD_EXPR);
8574 /* Create the destination vector */
8575 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8576 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8578 /* Get NCOPIES vector definitions for all operands except the reduction
8579 definition. */
8580 if (!cond_fn_p)
8582 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8583 single_defuse_cycle && reduc_index == 0
8584 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8585 single_defuse_cycle && reduc_index == 1
8586 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8587 op.num_ops == 3
8588 && !(single_defuse_cycle && reduc_index == 2)
8589 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8591 else
8593 /* For a conditional operation pass the truth type as mask
8594 vectype. */
8595 gcc_assert (single_defuse_cycle
8596 && (reduc_index == 1 || reduc_index == 2));
8597 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8598 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8599 reduc_index == 1 ? NULL_TREE : op.ops[1],
8600 NULL_TREE, &vec_oprnds1,
8601 reduc_index == 2 ? NULL_TREE : op.ops[2],
8602 NULL_TREE, &vec_oprnds2);
8605 /* For single def-use cycles get one copy of the vectorized reduction
8606 definition. */
8607 if (single_defuse_cycle)
8609 gcc_assert (!slp_node);
8610 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8611 op.ops[reduc_index],
8612 reduc_index == 0 ? &vec_oprnds0
8613 : (reduc_index == 1 ? &vec_oprnds1
8614 : &vec_oprnds2));
8617 bool emulated_mixed_dot_prod
8618 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8619 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8621 gimple *new_stmt;
8622 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8623 if (masked_loop_p && !mask_by_cond_expr)
8625 /* No conditional ifns have been defined for dot-product yet. */
8626 gcc_assert (code != DOT_PROD_EXPR);
8628 /* Make sure that the reduction accumulator is vop[0]. */
8629 if (reduc_index == 1)
8631 gcc_assert (commutative_binary_op_p (code, op.type));
8632 std::swap (vop[0], vop[1]);
8634 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8635 vec_num * ncopies, vectype_in, i);
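/* Emit cond_fn (mask, vop[0], vop[1], vop[0]) so that lanes with a false
   mask keep the accumulator value vop[0] unchanged.  */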
8636 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8637 vop[0], vop[1], vop[0]);
8638 new_temp = make_ssa_name (vec_dest, call);
8639 gimple_call_set_lhs (call, new_temp);
8640 gimple_call_set_nothrow (call, true);
8641 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8642 new_stmt = call;
8644 else
8646 if (op.num_ops >= 3)
8647 vop[2] = vec_oprnds2[i];
8649 if (masked_loop_p && mask_by_cond_expr)
8651 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8652 vec_num * ncopies, vectype_in, i);
8653 build_vect_cond_expr (code, vop, mask, gsi);
8656 if (emulated_mixed_dot_prod)
8657 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8658 vec_dest, vop);
8660 else if (code.is_internal_fn () && !cond_fn_p)
8661 new_stmt = gimple_build_call_internal (internal_fn (code),
8662 op.num_ops,
8663 vop[0], vop[1], vop[2]);
8664 else if (code.is_internal_fn () && cond_fn_p)
8665 new_stmt = gimple_build_call_internal (internal_fn (code),
8666 op.num_ops,
8667 vop[0], vop[1], vop[2],
8668 vop[1]);
8669 else
8670 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8671 vop[0], vop[1], vop[2]);
8672 new_temp = make_ssa_name (vec_dest, new_stmt);
8673 gimple_set_lhs (new_stmt, new_temp);
8674 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8677 if (slp_node)
8678 slp_node->push_vec_def (new_stmt);
8679 else if (single_defuse_cycle
8680 && i < ncopies - 1)
8682 if (reduc_index == 0)
8683 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8684 else if (reduc_index == 1)
8685 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8686 else if (reduc_index == 2)
8687 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8689 else
8690 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8693 if (!slp_node)
8694 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8696 return true;
8699 /* Transform phase of a cycle PHI. */
8701 bool
8702 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8703 stmt_vec_info stmt_info, gimple **vec_stmt,
8704 slp_tree slp_node, slp_instance slp_node_instance)
8706 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8707 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8708 int i;
8709 int ncopies;
8710 int j;
8711 bool nested_cycle = false;
8712 int vec_num;
8714 if (nested_in_vect_loop_p (loop, stmt_info))
8716 loop = loop->inner;
8717 nested_cycle = true;
8720 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8721 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8722 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8723 gcc_assert (reduc_info->is_reduc_info);
8725 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8726 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8727 /* Leave the scalar phi in place. */
8728 return true;
8730 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8731 /* For a nested cycle we do not fill the above. */
8732 if (!vectype_in)
8733 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8734 gcc_assert (vectype_in);
8736 if (slp_node)
8738 /* The size vect_schedule_slp_instance computes is off for us. */
8739 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8740 * SLP_TREE_LANES (slp_node), vectype_in);
8741 ncopies = 1;
8743 else
8745 vec_num = 1;
8746 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8749 /* Check whether we should use a single PHI node and accumulate
8750 vectors to one before the backedge. */
8751 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8752 ncopies = 1;
8754 /* Create the destination vector */
8755 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8756 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8757 vectype_out);
8759 /* Get the loop-entry arguments. */
8760 tree vec_initial_def = NULL_TREE;
8761 auto_vec<tree> vec_initial_defs;
8762 if (slp_node)
8764 vec_initial_defs.reserve (vec_num);
8765 if (nested_cycle)
8767 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8768 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8769 &vec_initial_defs);
8771 else
8773 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8774 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8775 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8777 unsigned int num_phis = stmts.length ();
8778 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8779 num_phis = 1;
8780 initial_values.reserve (num_phis);
8781 for (unsigned int i = 0; i < num_phis; ++i)
8783 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8784 initial_values.quick_push (vect_phi_initial_value (this_phi));
8786 if (vec_num == 1)
8787 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8788 if (!initial_values.is_empty ())
8790 tree initial_value
8791 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8792 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8793 tree neutral_op
8794 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8795 code, initial_value);
8796 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8797 &vec_initial_defs, vec_num,
8798 stmts.length (), neutral_op);
8802 else
8804 /* Get at the scalar def before the loop that defines the initial
8805 value of the reduction variable. */
8806 tree initial_def = vect_phi_initial_value (phi);
8807 reduc_info->reduc_initial_values.safe_push (initial_def);
8808 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8809 and we can't use zero for induc_val, use initial_def. Similarly
8810 for REDUC_MIN and initial_def larger than the base. */
8811 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8813 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8814 if (TREE_CODE (initial_def) == INTEGER_CST
8815 && !integer_zerop (induc_val)
8816 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8817 && tree_int_cst_lt (initial_def, induc_val))
8818 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8819 && tree_int_cst_lt (induc_val, initial_def))))
8821 induc_val = initial_def;
8822 /* Communicate to the epilogue generation that we used the
8823 initial_def. */
8824 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8826 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8828 else if (nested_cycle)
8830 /* Do not use an adjustment def as that case is not supported
8831 correctly if ncopies is not one. */
8832 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8833 ncopies, initial_def,
8834 &vec_initial_defs);
8836 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8837 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8838 /* Fill the initial vector with the initial scalar value. */
8839 vec_initial_def
8840 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8841 initial_def, initial_def);
8842 else
8844 if (ncopies == 1)
8845 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8846 if (!reduc_info->reduc_initial_values.is_empty ())
8848 initial_def = reduc_info->reduc_initial_values[0];
8849 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8850 tree neutral_op
8851 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8852 code, initial_def);
8853 gcc_assert (neutral_op);
8854 /* Try to simplify the vector initialization by applying an
8855 adjustment after the reduction has been performed. */
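/* For example (illustrative): a sum reduction with scalar initial value 10
   can start the vector accumulator at the neutral value {0, ..., 0} and add
   the 10 back once after the epilogue reduction.  */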
8856 if (!reduc_info->reused_accumulator
8857 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8858 && !operand_equal_p (neutral_op, initial_def))
8860 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8861 = initial_def;
8862 initial_def = neutral_op;
8864 vec_initial_def
8865 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8866 initial_def, neutral_op);
8871 if (vec_initial_def)
8873 vec_initial_defs.create (ncopies);
8874 for (i = 0; i < ncopies; ++i)
8875 vec_initial_defs.quick_push (vec_initial_def);
8878 if (auto *accumulator = reduc_info->reused_accumulator)
8880 tree def = accumulator->reduc_input;
8881 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8883 unsigned int nreduc;
8884 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8885 (TREE_TYPE (def)),
8886 TYPE_VECTOR_SUBPARTS (vectype_out),
8887 &nreduc);
8888 gcc_assert (res);
8889 gimple_seq stmts = NULL;
8890 /* Reduce the single vector to a smaller one. */
8891 if (nreduc != 1)
8893 /* Perform the reduction in the appropriate type. */
8894 tree rvectype = vectype_out;
8895 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8896 TREE_TYPE (TREE_TYPE (def))))
8897 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8898 TYPE_VECTOR_SUBPARTS
8899 (vectype_out));
8900 def = vect_create_partial_epilog (def, rvectype,
8901 STMT_VINFO_REDUC_CODE
8902 (reduc_info),
8903 &stmts);
8905 /* The epilogue loop might use a different vector mode, like
8906 VNx2DI vs. V2DI. */
8907 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8909 tree reduc_type = build_vector_type_for_mode
8910 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8911 def = gimple_convert (&stmts, reduc_type, def);
8913 /* Adjust the input so we pick up the partially reduced value
8914 for the skip edge in vect_create_epilog_for_reduction. */
8915 accumulator->reduc_input = def;
8916 /* And the reduction could be carried out using a different sign. */
8917 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8918 def = gimple_convert (&stmts, vectype_out, def);
8919 if (loop_vinfo->main_loop_edge)
8921 /* While we'd like to insert on the edge, doing so would split
8922 blocks and disturb bookkeeping; we will also eventually
8923 need this on the skip edge. Rely on sinking to fix up
8924 optimal placement and insert in the predecessor. */
8925 gimple_stmt_iterator gsi
8926 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8927 /* Insert before a cond that eventually skips the
8928 epilogue. */
8929 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8930 gsi_prev (&gsi);
8931 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8933 else
8934 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8935 stmts);
8937 if (loop_vinfo->main_loop_edge)
8938 vec_initial_defs[0]
8939 = vect_get_main_loop_result (loop_vinfo, def,
8940 vec_initial_defs[0]);
8941 else
8942 vec_initial_defs.safe_push (def);
8945 /* Generate the reduction PHIs upfront. */
8946 for (i = 0; i < vec_num; i++)
8948 tree vec_init_def = vec_initial_defs[i];
8949 for (j = 0; j < ncopies; j++)
8951 /* Create the reduction-phi that defines the reduction
8952 operand. */
8953 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8955 /* Set the loop-entry arg of the reduction-phi. */
8956 if (j != 0 && nested_cycle)
8957 vec_init_def = vec_initial_defs[j];
8958 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8959 UNKNOWN_LOCATION);
8961 /* The loop-latch arg is set in epilogue processing. */
8963 if (slp_node)
8964 slp_node->push_vec_def (new_phi);
8965 else
8967 if (j == 0)
8968 *vec_stmt = new_phi;
8969 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8974 return true;
8977 /* Vectorizes LC PHIs. */
8979 bool
8980 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8981 stmt_vec_info stmt_info, gimple **vec_stmt,
8982 slp_tree slp_node)
8984 if (!loop_vinfo
8985 || !is_a <gphi *> (stmt_info->stmt)
8986 || gimple_phi_num_args (stmt_info->stmt) != 1)
8987 return false;
8989 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8990 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8991 return false;
8993 if (!vec_stmt) /* transformation not required. */
8995 /* Deal with copies from externs or constants that are disguised as
8996 loop-closed PHI nodes (PR97886). */
8997 if (slp_node
8998 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8999 SLP_TREE_VECTYPE (slp_node)))
9001 if (dump_enabled_p ())
9002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9003 "incompatible vector types for invariants\n");
9004 return false;
9006 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9007 return true;
9010 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9011 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9012 basic_block bb = gimple_bb (stmt_info->stmt);
9013 edge e = single_pred_edge (bb);
9014 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9015 auto_vec<tree> vec_oprnds;
9016 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9017 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9018 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9019 for (unsigned i = 0; i < vec_oprnds.length (); i++)
9021 /* Create the vectorized LC PHI node. */
9022 gphi *new_phi = create_phi_node (vec_dest, bb);
9023 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9024 if (slp_node)
9025 slp_node->push_vec_def (new_phi);
9026 else
9027 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9029 if (!slp_node)
9030 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9032 return true;
9035 /* Vectorizes PHIs. */
9037 bool
9038 vectorizable_phi (vec_info *,
9039 stmt_vec_info stmt_info, gimple **vec_stmt,
9040 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9042 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9043 return false;
9045 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9046 return false;
9048 tree vectype = SLP_TREE_VECTYPE (slp_node);
9050 if (!vec_stmt) /* transformation not required. */
9052 slp_tree child;
9053 unsigned i;
9054 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9055 if (!child)
9057 if (dump_enabled_p ())
9058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9059 "PHI node with unvectorized backedge def\n");
9060 return false;
9062 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9064 if (dump_enabled_p ())
9065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9066 "incompatible vector types for invariants\n");
9067 return false;
9069 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9070 && !useless_type_conversion_p (vectype,
9071 SLP_TREE_VECTYPE (child)))
9073 /* With bools we can have mask and non-mask precision vectors
9074 or different non-mask precisions. While pattern recognition is
9075 supposed to guarantee consistency here, bugs in it can cause
9076 mismatches (PR103489 and PR103800 for example).
9077 Deal with them here instead of ICEing later. */
9078 if (dump_enabled_p ())
9079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9080 "incompatible vector type setup from "
9081 "bool pattern detection\n");
9082 return false;
9085 /* For single-argument PHIs assume coalescing which means zero cost
9086 for the scalar and the vector PHIs. This avoids artificially
9087 favoring the vector path (but may pessimize it in some cases). */
9088 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9089 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9090 vector_stmt, stmt_info, vectype, 0, vect_body);
9091 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9092 return true;
9095 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9096 basic_block bb = gimple_bb (stmt_info->stmt);
9097 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9098 auto_vec<gphi *> new_phis;
9099 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9101 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9103 /* Skip not yet vectorized defs. */
9104 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9105 && SLP_TREE_VEC_DEFS (child).is_empty ())
9106 continue;
9108 auto_vec<tree> vec_oprnds;
9109 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9110 if (!new_phis.exists ())
9112 new_phis.create (vec_oprnds.length ());
9113 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9115 /* Create the vectorized LC PHI node. */
9116 new_phis.quick_push (create_phi_node (vec_dest, bb));
9117 slp_node->push_vec_def (new_phis[j]);
9120 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9121 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9122 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9124 /* We should have at least one already vectorized child. */
9125 gcc_assert (new_phis.exists ());
9127 return true;
9130 /* Vectorizes first order recurrences. An overview of the transformation
9131 is described below. Suppose we have the following loop.
9133 int t = 0;
9134 for (int i = 0; i < n; ++i)
9136 b[i] = a[i] - t;
9137 t = a[i];
9140 There is a first-order recurrence on 't' (it carries a[i] from the
9141 previous iteration). For this loop, the scalar IR looks (simplified) like:
9143 scalar.preheader:
9144 init = 0;
9146 scalar.body:
9147 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9148 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9149 _1 = a[i]
9150 b[i] = _1 - _2
9151 if (i < n) goto scalar.body
9153 In this example, _2 is a recurrence because its value depends on the
9154 previous iteration. We vectorize this as (VF = 4)
9156 vector.preheader:
9157 vect_init = vect_cst(..., ..., ..., 0)
9159 vector.body
9160 i = PHI <0(vector.preheader), i+4(vector.body)>
9161 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9162 vect_2 = a[i, i+1, i+2, i+3];
9163 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9164 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9165 if (..) goto vector.body
9167 In this function, vectorizable_recurr, we code generate both the
9168 vector PHI node and the permute since those together compute the
9169 vectorized value of the scalar PHI. We do not yet have the
9170 backedge value to fill in there nor into the vec_perm. Those
9171 are filled in maybe_set_vectorized_backedge_value and
9172 vect_schedule_scc.
9174 TODO: Since the scalar loop does not have a use of the recurrence
9175 outside of the loop the natural way to implement peeling via
9176 vectorizing the live value doesn't work. For now peeling of loops
9177 with a recurrence is not implemented. For SLP the supported cases
9178 are restricted to those requiring a single vector recurrence PHI. */
9180 bool
9181 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9182 gimple **vec_stmt, slp_tree slp_node,
9183 stmt_vector_for_cost *cost_vec)
9185 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9186 return false;
9188 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9190 /* So far we only support first-order recurrence auto-vectorization. */
9191 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9192 return false;
9194 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9195 unsigned ncopies;
9196 if (slp_node)
9197 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9198 else
9199 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9200 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9201 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9202 /* We need to be able to make progress with a single vector. */
9203 if (maybe_gt (dist * 2, nunits))
9205 if (dump_enabled_p ())
9206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9207 "first order recurrence exceeds half of "
9208 "a vector\n");
9209 return false;
9212 /* First-order recurrence autovectorization needs to handle permutation
9213 with indices = [nunits-1, nunits, nunits+1, ...]. */
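/* For instance (illustrative), with nunits = 8 and dist = 2 the selected
   indices are {6, 7, 8, 9, 10, 11, 12, 13}: the last two lanes of the
   previous vector followed by the first six lanes of the current one.  */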
9214 vec_perm_builder sel (nunits, 1, 3);
9215 for (int i = 0; i < 3; ++i)
9216 sel.quick_push (nunits - dist + i);
9217 vec_perm_indices indices (sel, 2, nunits);
9219 if (!vec_stmt) /* transformation not required. */
9221 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9222 indices))
9223 return false;
9225 if (slp_node)
9227 /* We eventually need to set a vector type on invariant
9228 arguments. */
9229 unsigned j;
9230 slp_tree child;
9231 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9232 if (!vect_maybe_update_slp_op_vectype
9233 (child, SLP_TREE_VECTYPE (slp_node)))
9235 if (dump_enabled_p ())
9236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9237 "incompatible vector types for "
9238 "invariants\n");
9239 return false;
9242 /* The recurrence costs the initialization vector and one permute
9243 for each copy. */
9244 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9245 stmt_info, 0, vect_prologue);
9246 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9247 stmt_info, 0, vect_body);
9248 if (dump_enabled_p ())
9249 dump_printf_loc (MSG_NOTE, vect_location,
9250 "vectorizable_recurr: inside_cost = %d, "
9251 "prologue_cost = %d .\n", inside_cost,
9252 prologue_cost);
9254 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9255 return true;
9258 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9259 basic_block bb = gimple_bb (phi);
9260 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9261 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9263 gimple_seq stmts = NULL;
9264 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9265 gsi_insert_seq_on_edge_immediate (pe, stmts);
9267 tree vec_init = build_vector_from_val (vectype, preheader);
9268 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9270 /* Create the vectorized first-order PHI node. */
9271 tree vec_dest = vect_get_new_vect_var (vectype,
9272 vect_simple_var, "vec_recur_");
9273 gphi *new_phi = create_phi_node (vec_dest, bb);
9274 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9276 /* Insert the shuffles for the first-order recurrence autovectorization.
9277 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9278 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9280 /* Insert the required permute after the latch definition. The
9281 second and later operands are tentative and will be updated when we have
9282 vectorized the latch definition. */
9283 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9284 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9285 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9286 gsi_next (&gsi2);
9288 for (unsigned i = 0; i < ncopies; ++i)
9290 vec_dest = make_ssa_name (vectype);
9291 gassign *vperm
9292 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9293 i == 0 ? gimple_phi_result (new_phi) : NULL,
9294 NULL, perm);
9295 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9297 if (slp_node)
9298 slp_node->push_vec_def (vperm);
9299 else
9300 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9303 if (!slp_node)
9304 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9305 return true;
9308 /* Return true if VECTYPE represents a vector that requires lowering
9309 by the vector lowering pass. */
9311 bool
9312 vect_emulated_vector_p (tree vectype)
9314 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9315 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9316 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9319 /* Return true if we can emulate CODE on an integer mode representation
9320 of a vector. */
9322 bool
9323 vect_can_vectorize_without_simd_p (tree_code code)
9325 switch (code)
9327 case PLUS_EXPR:
9328 case MINUS_EXPR:
9329 case NEGATE_EXPR:
9330 case BIT_AND_EXPR:
9331 case BIT_IOR_EXPR:
9332 case BIT_XOR_EXPR:
9333 case BIT_NOT_EXPR:
9334 return true;
9336 default:
9337 return false;
9341 /* Likewise, but taking a code_helper. */
9343 bool
9344 vect_can_vectorize_without_simd_p (code_helper code)
9346 return (code.is_tree_code ()
9347 && vect_can_vectorize_without_simd_p (tree_code (code)));
9350 /* Create vector init for vectorized iv. */
9351 static tree
9352 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9353 tree step_expr, poly_uint64 nunits,
9354 tree vectype,
9355 enum vect_induction_op_type induction_type)
9357 unsigned HOST_WIDE_INT const_nunits;
9358 tree vec_shift, vec_init, new_name;
9359 unsigned i;
9360 tree itype = TREE_TYPE (vectype);
9362 /* iv_loop is the loop to be vectorized. Create:
9363 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
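/* Illustrative shapes of the result for the nonlinear cases handled below
   (4 lanes): shr: [X>>0, X>>S, X>>2*S, X>>3*S] (shl likewise with <<);
   neg: [X, -X, X, -X]; mul: [X, X*S, X*S*S, X*S*S*S].  */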
9364 new_name = gimple_convert (stmts, itype, init_expr);
9365 switch (induction_type)
9367 case vect_step_op_shr:
9368 case vect_step_op_shl:
9369 /* Build the Initial value from shift_expr. */
9370 vec_init = gimple_build_vector_from_val (stmts,
9371 vectype,
9372 new_name);
9373 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9374 build_zero_cst (itype), step_expr);
9375 vec_init = gimple_build (stmts,
9376 (induction_type == vect_step_op_shr
9377 ? RSHIFT_EXPR : LSHIFT_EXPR),
9378 vectype, vec_init, vec_shift);
9379 break;
9381 case vect_step_op_neg:
9383 vec_init = gimple_build_vector_from_val (stmts,
9384 vectype,
9385 new_name);
9386 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9387 vectype, vec_init);
9388 /* The encoding has 2 interleaved stepped patterns. */
9389 vec_perm_builder sel (nunits, 2, 3);
9390 sel.quick_grow (6);
9391 for (i = 0; i < 3; i++)
9393 sel[2 * i] = i;
9394 sel[2 * i + 1] = i + nunits;
9396 vec_perm_indices indices (sel, 2, nunits);
9397 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9398 fail when vec_init is a const vector. In that situation the vec_perm is
9399 not really needed. */
9400 tree perm_mask_even
9401 = vect_gen_perm_mask_any (vectype, indices);
9402 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9403 vectype,
9404 vec_init, vec_neg,
9405 perm_mask_even);
9407 break;
9409 case vect_step_op_mul:
9411 /* Use an unsigned multiplication to avoid undefined integer overflow. */
9412 gcc_assert (nunits.is_constant (&const_nunits));
9413 tree utype = unsigned_type_for (itype);
9414 tree uvectype = build_vector_type (utype,
9415 TYPE_VECTOR_SUBPARTS (vectype));
9416 new_name = gimple_convert (stmts, utype, new_name);
9417 vec_init = gimple_build_vector_from_val (stmts,
9418 uvectype,
9419 new_name);
9420 tree_vector_builder elts (uvectype, const_nunits, 1);
9421 tree elt_step = build_one_cst (utype);
9423 elts.quick_push (elt_step);
9424 for (i = 1; i < const_nunits; i++)
9426 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i). */
9427 elt_step = gimple_build (stmts, MULT_EXPR,
9428 utype, elt_step, step_expr);
9429 elts.quick_push (elt_step);
9431 /* Create a vector from [1, step_expr, pow (step_expr, 2), ...,
9432 pow (step_expr, nunits-1)]. */
9433 tree vec_mul = gimple_build_vector (stmts, &elts);
9434 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9435 vec_init, vec_mul);
9436 vec_init = gimple_convert (stmts, vectype, vec_init);
9438 break;
9440 default:
9441 gcc_unreachable ();
9444 return vec_init;
9447 /* Peel init_expr by skip_niters for induction_type. */
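/* Illustrative examples: for vect_step_op_neg the peeled value is -init_expr
   when skip_niters is odd and init_expr otherwise; for the shifts it is
   init_expr shifted by step * skip_niters (subject to the precision clamp
   handled below); for vect_step_op_mul it is
   init_expr * pow (step, skip_niters) computed modulo 2^precision.  */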
9448 tree
9449 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9450 tree skip_niters, tree step_expr,
9451 enum vect_induction_op_type induction_type)
9453 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9454 tree type = TREE_TYPE (init_expr);
9455 unsigned prec = TYPE_PRECISION (type);
9456 switch (induction_type)
9458 case vect_step_op_neg:
9459 if (TREE_INT_CST_LOW (skip_niters) % 2)
9460 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9461 /* else no change. */
9462 break;
9464 case vect_step_op_shr:
9465 case vect_step_op_shl:
9466 skip_niters = gimple_convert (stmts, type, skip_niters);
9467 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9468 /* When the shift amount >= precision, we need to avoid undefined behavior.
9469 In the original loop there is no UB, and according to the semantics,
9470 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9471 if (!tree_fits_uhwi_p (step_expr)
9472 || tree_to_uhwi (step_expr) >= prec)
9474 if (induction_type == vect_step_op_shl
9475 || TYPE_UNSIGNED (type))
9476 init_expr = build_zero_cst (type);
9477 else
9478 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9479 init_expr,
9480 wide_int_to_tree (type, prec - 1));
9482 else
9483 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9484 ? RSHIFT_EXPR : LSHIFT_EXPR),
9485 type, init_expr, step_expr);
9486 break;
9488 case vect_step_op_mul:
9490 tree utype = unsigned_type_for (type);
9491 init_expr = gimple_convert (stmts, utype, init_expr);
9492 wide_int skipn = wi::to_wide (skip_niters);
9493 wide_int begin = wi::to_wide (step_expr);
9494 auto_mpz base, exp, mod, res;
9495 wi::to_mpz (begin, base, TYPE_SIGN (type));
9496 wi::to_mpz (skipn, exp, UNSIGNED);
9497 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9498 mpz_powm (res, base, exp, mod);
9499 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9500 tree mult_expr = wide_int_to_tree (utype, begin);
9501 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9502 init_expr, mult_expr);
9503 init_expr = gimple_convert (stmts, type, init_expr);
9505 break;
9507 default:
9508 gcc_unreachable ();
9511 return init_expr;
9514 /* Create vector step for vectorized iv. */
9515 static tree
9516 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9517 poly_uint64 vf,
9518 enum vect_induction_op_type induction_type)
9520 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9521 tree new_name = NULL;
9522 /* Step should be pow (step, vf) for mult induction. */
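/* E.g. (illustrative) with step 3 and VF 4 the combined step is
   pow (3, 4) = 81, since one vector iteration covers VF scalar iterations;
   for the shift cases the combined step is VF * step instead.  */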
9523 if (induction_type == vect_step_op_mul)
9525 gcc_assert (vf.is_constant ());
9526 wide_int begin = wi::to_wide (step_expr);
9528 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9529 begin = wi::mul (begin, wi::to_wide (step_expr));
9531 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9533 else if (induction_type == vect_step_op_neg)
9534 /* Do nothing. */
9536 else
9537 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9538 expr, step_expr);
9539 return new_name;
9542 static tree
9543 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9544 stmt_vec_info stmt_info,
9545 tree new_name, tree vectype,
9546 enum vect_induction_op_type induction_type)
9548 /* No step is needed for neg induction. */
9549 if (induction_type == vect_step_op_neg)
9550 return NULL;
9552 tree t = unshare_expr (new_name);
9553 gcc_assert (CONSTANT_CLASS_P (new_name)
9554 || TREE_CODE (new_name) == SSA_NAME);
9555 tree new_vec = build_vector_from_val (vectype, t);
9556 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9557 new_vec, vectype, NULL);
9558 return vec_step;
9561 /* Update vectorized iv with vect_step, induc_def is init. */
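/* E.g. (illustrative): for vect_step_op_shr this computes
   vec_def = induc_def >> vec_step elementwise, where vec_step holds
   step * VF in every lane; for vect_step_op_mul it computes
   vec_def = induc_def * vec_step with vec_step holding pow (step, VF).  */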
9562 static tree
9563 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9564 tree induc_def, tree vec_step,
9565 enum vect_induction_op_type induction_type)
9567 tree vec_def = induc_def;
9568 switch (induction_type)
9570 case vect_step_op_mul:
9572 /* Use an unsigned multiplication to avoid undefined integer overflow. */
9573 tree uvectype
9574 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9575 TYPE_VECTOR_SUBPARTS (vectype));
9576 vec_def = gimple_convert (stmts, uvectype, vec_def);
9577 vec_step = gimple_convert (stmts, uvectype, vec_step);
9578 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9579 vec_def, vec_step);
9580 vec_def = gimple_convert (stmts, vectype, vec_def);
9582 break;
9584 case vect_step_op_shr:
9585 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9586 vec_def, vec_step);
9587 break;
9589 case vect_step_op_shl:
9590 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9591 vec_def, vec_step);
9592 break;
9593 case vect_step_op_neg:
9594 vec_def = induc_def;
9595 /* Do nothing. */
9596 break;
9597 default:
9598 gcc_unreachable ();
9601 return vec_def;
9605 /* Function vectorizable_nonlinear_induction
9607 Check if STMT_INFO performs a nonlinear induction computation that can be
9608 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9609 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9610 basic block.
9611 Return true if STMT_INFO is vectorizable in this way. */
9613 static bool
9614 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9615 stmt_vec_info stmt_info,
9616 gimple **vec_stmt, slp_tree slp_node,
9617 stmt_vector_for_cost *cost_vec)
9619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9620 unsigned ncopies;
9621 bool nested_in_vect_loop = false;
9622 class loop *iv_loop;
9623 tree vec_def;
9624 edge pe = loop_preheader_edge (loop);
9625 basic_block new_bb;
9626 tree vec_init, vec_step;
9627 tree new_name;
9628 gimple *new_stmt;
9629 gphi *induction_phi;
9630 tree induc_def, vec_dest;
9631 tree init_expr, step_expr;
9632 tree niters_skip;
9633 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9634 unsigned i;
9635 gimple_stmt_iterator si;
9637 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9639 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9640 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9641 enum vect_induction_op_type induction_type
9642 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9644 gcc_assert (induction_type > vect_step_op_add);
9646 if (slp_node)
9647 ncopies = 1;
9648 else
9649 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9650 gcc_assert (ncopies >= 1);
9652 /* FORNOW. Only handle nonlinear induction in the same loop. */
9653 if (nested_in_vect_loop_p (loop, stmt_info))
9655 if (dump_enabled_p ())
9656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9657 "nonlinear induction in nested loop.\n");
9658 return false;
9661 iv_loop = loop;
9662 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9664 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
9665 update for each iv and a permutation to generate the wanted vector iv. */
9666 if (slp_node)
9668 if (dump_enabled_p ())
9669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670 "SLP induction not supported for nonlinear"
9671 " induction.\n");
9672 return false;
9675 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9677 if (dump_enabled_p ())
9678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9679 "floating point nonlinear induction vectorization"
9680 " not supported.\n");
9681 return false;
9684 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9685 init_expr = vect_phi_initial_value (phi);
9686 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9687 && TREE_CODE (step_expr) == INTEGER_CST);
9688 /* step_expr should be aligned with init_expr,
9689 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9690 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9692 if (TREE_CODE (init_expr) == INTEGER_CST)
9693 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9694 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9696 /* INIT_EXPR could be a bit_field, bail out in that case. */
9697 if (dump_enabled_p ())
9698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9699 "nonlinear induction vectorization failed:"
9700 " component type of vectype is not a nop conversion"
9701 " from type of init_expr.\n");
9702 return false;
9705 switch (induction_type)
9707 case vect_step_op_neg:
9708 if (TREE_CODE (init_expr) != INTEGER_CST
9709 && TREE_CODE (init_expr) != REAL_CST)
9711 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9712 if (!directly_supported_p (NEGATE_EXPR, vectype))
9713 return false;
9715 /* The encoding has 2 interleaved stepped patterns. */
9716 vec_perm_builder sel (nunits, 2, 3);
9717 machine_mode mode = TYPE_MODE (vectype);
9718 sel.quick_grow (6);
9719 for (i = 0; i < 3; i++)
9721 sel[i * 2] = i;
9722 sel[i * 2 + 1] = i + nunits;
9724 vec_perm_indices indices (sel, 2, nunits);
9725 if (!can_vec_perm_const_p (mode, mode, indices))
9726 return false;
9728 break;
9730 case vect_step_op_mul:
9732 /* Check for backend support of MULT_EXPR. */
9733 if (!directly_supported_p (MULT_EXPR, vectype))
9734 return false;
9736 /* ?? How to construct the vector step for variable-length vectors.
9737 [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9738 if (!vf.is_constant ())
9739 return false;
9741 break;
9743 case vect_step_op_shr:
9744 /* Check for backend support of RSHIFT_EXPR. */
9745 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9746 return false;
9748 /* Don't shift more than type precision to avoid UD. */
9749 if (!tree_fits_uhwi_p (step_expr)
9750 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9751 TYPE_PRECISION (TREE_TYPE (init_expr))))
9752 return false;
9753 break;
9755 case vect_step_op_shl:
9756 /* Check for backend support of LSHIFT_EXPR. */
9757 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9758 return false;
9760 /* Don't shift more than type precision to avoid UD. */
9761 if (!tree_fits_uhwi_p (step_expr)
9762 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9763 TYPE_PRECISION (TREE_TYPE (init_expr))))
9764 return false;
9766 break;
9768 default:
9769 gcc_unreachable ();
9772 if (!vec_stmt) /* transformation not required. */
9774 unsigned inside_cost = 0, prologue_cost = 0;
9775 /* loop cost for vec_loop. */
9777 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9778 stmt_info, 0, vect_body);
9780 /* Neg induction doesn't have any inside_cost. */
9782 if (induction_type == vect_step_op_neg)
9783 inside_cost = 0;
9785 /* prologue cost for vec_init and vec_step. */
9786 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9787 stmt_info, 0, vect_prologue);
9789 if (dump_enabled_p ())
9790 dump_printf_loc (MSG_NOTE, vect_location,
9791 "vect_model_induction_cost: inside_cost = %d, "
9792 "prologue_cost = %d. \n", inside_cost,
9793 prologue_cost);
9795 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9796 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9797 return true;
9800 /* Transform. */
9802 /* Compute a vector variable, initialized with the first VF values of
9803 the induction variable. E.g., for an iv with IV_PHI='X' and
9804 evolution S, for a vector of 4 units, we want to compute:
9805 [X, X + S, X + 2*S, X + 3*S]. */
9807 if (dump_enabled_p ())
9808 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9810 pe = loop_preheader_edge (iv_loop);
9811 /* Find the first insertion point in the BB. */
9812 basic_block bb = gimple_bb (phi);
9813 si = gsi_after_labels (bb);
9815 gimple_seq stmts = NULL;
9817 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9818 /* If we are using the loop mask to "peel" for alignment then we need
9819 to adjust the start value here. */
9820 if (niters_skip != NULL_TREE)
9821 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9822 step_expr, induction_type);
9824 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9825 step_expr, nunits, vectype,
9826 induction_type);
9827 if (stmts)
9829 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9830 gcc_assert (!new_bb);
9833 stmts = NULL;
9834 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9835 vf, induction_type);
9836 if (stmts)
9838 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9839 gcc_assert (!new_bb);
9842 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9843 new_name, vectype,
9844 induction_type);
9845 /* Create the following def-use cycle:
9846 loop prolog:
9847 vec_init = ...
9848 vec_step = ...
9849 loop:
9850 vec_iv = PHI <vec_init, vec_loop>
9852 STMT
9854 vec_loop = vec_iv + vec_step; */
9856 /* Create the induction-phi that defines the induction-operand. */
9857 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9858 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9859 induc_def = PHI_RESULT (induction_phi);
9861 /* Create the iv update inside the loop. */
9862 stmts = NULL;
9863 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9864 induc_def, vec_step,
9865 induction_type);
9867 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9868 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9870 /* Set the arguments of the phi node: */
9871 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9872 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9873 UNKNOWN_LOCATION);
9875 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9876 *vec_stmt = induction_phi;
9878 /* In case the vectorization factor (VF) is bigger than the number
9879 of elements that we can fit in a vectype (nunits), we have to generate
9880 more than one vector stmt - i.e - we need to "unroll" the
9881 vector stmt by a factor VF/nunits. For more details see documentation
9882 in vectorizable_operation. */
9884 if (ncopies > 1)
9886 stmts = NULL;
9887 /* FORNOW. This restriction should be relaxed. */
9888 gcc_assert (!nested_in_vect_loop);
9890 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9891 nunits, induction_type);
9893 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9894 new_name, vectype,
9895 induction_type);
9896 vec_def = induc_def;
9897 for (i = 1; i < ncopies; i++)
9899 /* vec_i = vec_prev <op> vec_step, with <op> given by the induction type. */
9900 stmts = NULL;
9901 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9902 vec_def, vec_step,
9903 induction_type);
9904 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9905 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9906 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9910 if (dump_enabled_p ())
9911 dump_printf_loc (MSG_NOTE, vect_location,
9912 "transform induction: created def-use cycle: %G%G",
9913 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9915 return true;
9918 /* Function vectorizable_induction
9920 Check if STMT_INFO performs an induction computation that can be vectorized.
9921 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9922 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9923 Return true if STMT_INFO is vectorizable in this way. */
9925 bool
9926 vectorizable_induction (loop_vec_info loop_vinfo,
9927 stmt_vec_info stmt_info,
9928 gimple **vec_stmt, slp_tree slp_node,
9929 stmt_vector_for_cost *cost_vec)
9931 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9932 unsigned ncopies;
9933 bool nested_in_vect_loop = false;
9934 class loop *iv_loop;
9935 tree vec_def;
9936 edge pe = loop_preheader_edge (loop);
9937 basic_block new_bb;
9938 tree new_vec, vec_init, vec_step, t;
9939 tree new_name;
9940 gimple *new_stmt;
9941 gphi *induction_phi;
9942 tree induc_def, vec_dest;
9943 tree init_expr, step_expr;
9944 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9945 unsigned i;
9946 tree expr;
9947 gimple_stmt_iterator si;
9948 enum vect_induction_op_type induction_type
9949 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9951 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9952 if (!phi)
9953 return false;
9955 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9956 return false;
9958 /* Make sure it was recognized as induction computation. */
9959 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9960 return false;
9962 /* Handle nonlinear induction in a separate place. */
9963 if (induction_type != vect_step_op_add)
9964 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9965 vec_stmt, slp_node, cost_vec);
9967 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9968 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9970 if (slp_node)
9971 ncopies = 1;
9972 else
9973 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9974 gcc_assert (ncopies >= 1);
9976 /* FORNOW. These restrictions should be relaxed. */
9977 if (nested_in_vect_loop_p (loop, stmt_info))
9979 imm_use_iterator imm_iter;
9980 use_operand_p use_p;
9981 gimple *exit_phi;
9982 edge latch_e;
9983 tree loop_arg;
9985 if (ncopies > 1)
9987 if (dump_enabled_p ())
9988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9989 "multiple types in nested loop.\n");
9990 return false;
9993 exit_phi = NULL;
9994 latch_e = loop_latch_edge (loop->inner);
9995 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9996 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9998 gimple *use_stmt = USE_STMT (use_p);
9999 if (is_gimple_debug (use_stmt))
10000 continue;
10002 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
10004 exit_phi = use_stmt;
10005 break;
10008 if (exit_phi)
10010 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10011 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10012 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10014 if (dump_enabled_p ())
10015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10016 "inner-loop induction only used outside "
10017 "of the outer vectorized loop.\n");
10018 return false;
10022 nested_in_vect_loop = true;
10023 iv_loop = loop->inner;
10025 else
10026 iv_loop = loop;
10027 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10029 if (slp_node && !nunits.is_constant ())
10031 /* The current SLP code creates the step value element-by-element. */
10032 if (dump_enabled_p ())
10033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10034 "SLP induction not supported for variable-length"
10035 " vectors.\n");
10036 return false;
10039 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10041 if (dump_enabled_p ())
10042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10043 "floating point induction vectorization disabled\n");
10044 return false;
10047 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10048 gcc_assert (step_expr != NULL_TREE);
10049 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10050 && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10052 if (dump_enabled_p ())
10053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10054 "bit-precision induction vectorization not "
10055 "supported.\n");
10056 return false;
10058 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10060 /* Check for backend support of PLUS/MINUS_EXPR. */
10061 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10062 || !directly_supported_p (MINUS_EXPR, step_vectype))
10063 return false;
10065 if (!vec_stmt) /* transformation not required. */
10067 unsigned inside_cost = 0, prologue_cost = 0;
10068 if (slp_node)
10070 /* We eventually need to set a vector type on invariant
10071 arguments. */
10072 unsigned j;
10073 slp_tree child;
10074 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10075 if (!vect_maybe_update_slp_op_vectype
10076 (child, SLP_TREE_VECTYPE (slp_node)))
10078 if (dump_enabled_p ())
10079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10080 "incompatible vector types for "
10081 "invariants\n");
10082 return false;
10084 /* loop cost for vec_loop. */
10085 inside_cost
10086 = record_stmt_cost (cost_vec,
10087 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10088 vector_stmt, stmt_info, 0, vect_body);
10089 /* prologue cost for vec_init (if not nested) and step. */
10090 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10091 scalar_to_vec,
10092 stmt_info, 0, vect_prologue);
10094 else /* if (!slp_node) */
10096 /* loop cost for vec_loop. */
10097 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10098 stmt_info, 0, vect_body);
10099 /* prologue cost for vec_init and vec_step. */
10100 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10101 stmt_info, 0, vect_prologue);
10103 if (dump_enabled_p ())
10104 dump_printf_loc (MSG_NOTE, vect_location,
10105 "vect_model_induction_cost: inside_cost = %d, "
10106 "prologue_cost = %d .\n", inside_cost,
10107 prologue_cost);
10109 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10110 DUMP_VECT_SCOPE ("vectorizable_induction");
10111 return true;
10114 /* Transform. */
10116 /* Compute a vector variable, initialized with the first VF values of
10117 the induction variable. E.g., for an iv with IV_PHI='X' and
10118 evolution S, for a vector of 4 units, we want to compute:
10119 [X, X + S, X + 2*S, X + 3*S]. */
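/* As a concrete illustration (values picked arbitrarily): for X = 5, S = 3
   and four lanes this is { 5, 8, 11, 14 }, and with VF 4 every lane then
   advances by VF * S = 12 per vector iteration.  */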
10121 if (dump_enabled_p ())
10122 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10124 pe = loop_preheader_edge (iv_loop);
10125 /* Find the first insertion point in the BB. */
10126 basic_block bb = gimple_bb (phi);
10127 si = gsi_after_labels (bb);
10129 /* For SLP induction we have to generate several IVs as for example
10130 with group size 3 we need
10131 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10132 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10133 if (slp_node)
10135 /* Enforced above. */
10136 unsigned int const_nunits = nunits.to_constant ();
10138 /* The initial values are vectorized, but any lanes > group_size
10139 need adjustment. */
10140 slp_tree init_node
10141 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10143 /* Gather steps. Since we do not vectorize inductions as
10144 cycles we have to reconstruct the step from SCEV data. */
10145 unsigned group_size = SLP_TREE_LANES (slp_node);
10146 tree *steps = XALLOCAVEC (tree, group_size);
10147 tree *inits = XALLOCAVEC (tree, group_size);
10148 stmt_vec_info phi_info;
10149 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10151 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10152 if (!init_node)
10153 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10154 pe->dest_idx);
10157 /* Now generate the IVs. */
10158 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10159 gcc_assert ((const_nunits * nvects) % group_size == 0);
10160 unsigned nivs;
10161 if (nested_in_vect_loop)
10162 nivs = nvects;
10163 else
10165 /* Compute the number of distinct IVs we need. First reduce
10166 group_size if it is a multiple of const_nunits so we get
10167 one IV for a group_size of 4 but const_nunits 2. */
10168 unsigned group_sizep = group_size;
10169 if (group_sizep % const_nunits == 0)
10170 group_sizep = group_sizep / const_nunits;
10171 nivs = least_common_multiple (group_sizep,
10172 const_nunits) / const_nunits;
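/* E.g. for the group size 3, const_nunits 4 case shown above this yields
   nivs = lcm (3, 4) / 4 = 3, while group_size 4 with const_nunits 2 first
   reduces to group_sizep 2 and then nivs = lcm (2, 2) / 2 = 1.  */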
10174 tree stept = TREE_TYPE (step_vectype);
10175 tree lupdate_mul = NULL_TREE;
10176 if (!nested_in_vect_loop)
10178 /* The number of iterations covered in one vector iteration. */
10179 unsigned lup_mul = (nvects * const_nunits) / group_size;
10180 lupdate_mul
10181 = build_vector_from_val (step_vectype,
10182 SCALAR_FLOAT_TYPE_P (stept)
10183 ? build_real_from_wide (stept, lup_mul,
10184 UNSIGNED)
10185 : build_int_cstu (stept, lup_mul));
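/* Continuing the group size 3, const_nunits 4 illustration with nvects 3:
   one vector iteration covers lup_mul = 12 / 3 = 4 scalar iterations, so
   each IV is advanced by four times its per-lane step.  */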
10187 tree peel_mul = NULL_TREE;
10188 gimple_seq init_stmts = NULL;
10189 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10191 if (SCALAR_FLOAT_TYPE_P (stept))
10192 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10193 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10194 else
10195 peel_mul = gimple_convert (&init_stmts, stept,
10196 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10197 peel_mul = gimple_build_vector_from_val (&init_stmts,
10198 step_vectype, peel_mul);
10200 unsigned ivn;
10201 auto_vec<tree> vec_steps;
10202 for (ivn = 0; ivn < nivs; ++ivn)
10204 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10205 tree_vector_builder init_elts (vectype, const_nunits, 1);
10206 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10207 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10209 /* The scalar steps of the IVs. */
10210 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10211 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10212 step_elts.quick_push (elt);
10213 if (!init_node)
10215 /* The scalar inits of the IVs if not vectorized. */
10216 elt = inits[(ivn*const_nunits + eltn) % group_size];
10217 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10218 TREE_TYPE (elt)))
10219 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10220 TREE_TYPE (vectype), elt);
10221 init_elts.quick_push (elt);
10223 /* The number of steps to add to the initial values. */
10224 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10225 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10226 ? build_real_from_wide (stept,
10227 mul_elt, UNSIGNED)
10228 : build_int_cstu (stept, mul_elt));
10230 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10231 vec_steps.safe_push (vec_step);
10232 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10233 if (peel_mul)
10234 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10235 step_mul, peel_mul);
10236 if (!init_node)
10237 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10239 /* Create the induction-phi that defines the induction-operand. */
10240 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10241 "vec_iv_");
10242 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10243 induc_def = PHI_RESULT (induction_phi);
10245 /* Create the iv update inside the loop */
10246 tree up = vec_step;
10247 if (lupdate_mul)
10248 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10249 vec_step, lupdate_mul);
10250 gimple_seq stmts = NULL;
10251 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10252 vec_def = gimple_build (&stmts,
10253 PLUS_EXPR, step_vectype, vec_def, up);
10254 vec_def = gimple_convert (&stmts, vectype, vec_def);
10255 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10256 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10257 UNKNOWN_LOCATION);
10259 if (init_node)
10260 vec_init = vect_get_slp_vect_def (init_node, ivn);
10261 if (!nested_in_vect_loop
10262 && !integer_zerop (step_mul))
10264 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10265 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10266 vec_step, step_mul);
10267 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10268 vec_def, up);
10269 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10272 /* Set the arguments of the phi node: */
10273 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10275 slp_node->push_vec_def (induction_phi);
10277 if (!nested_in_vect_loop)
10279 /* Fill up to the number of vectors we need for the whole group. */
10280 nivs = least_common_multiple (group_size,
10281 const_nunits) / const_nunits;
10282 vec_steps.reserve (nivs-ivn);
10283 for (; ivn < nivs; ++ivn)
10285 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10286 vec_steps.quick_push (vec_steps[0]);
10290 /* Re-use IVs when we can. We are generating further vector
10291 stmts by adding VF' * stride to the IVs generated above. */
10292 if (ivn < nvects)
10294 unsigned vfp
10295 = least_common_multiple (group_size, const_nunits) / group_size;
10296 tree lupdate_mul
10297 = build_vector_from_val (step_vectype,
10298 SCALAR_FLOAT_TYPE_P (stept)
10299 ? build_real_from_wide (stept,
10300 vfp, UNSIGNED)
10301 : build_int_cstu (stept, vfp));
10302 for (; ivn < nvects; ++ivn)
10304 gimple *iv
10305 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10306 tree def = gimple_get_lhs (iv);
10307 if (ivn < 2*nivs)
10308 vec_steps[ivn - nivs]
10309 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10310 vec_steps[ivn - nivs], lupdate_mul);
10311 gimple_seq stmts = NULL;
10312 def = gimple_convert (&stmts, step_vectype, def);
10313 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10314 def, vec_steps[ivn % nivs]);
10315 def = gimple_convert (&stmts, vectype, def);
10316 if (gimple_code (iv) == GIMPLE_PHI)
10317 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10318 else
10320 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10321 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10323 slp_node->push_vec_def (def);
10327 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10328 gcc_assert (!new_bb);
10330 return true;
10333 init_expr = vect_phi_initial_value (phi);
10335 gimple_seq stmts = NULL;
10336 if (!nested_in_vect_loop)
10338 /* Convert the initial value to the IV update type. */
10339 tree new_type = TREE_TYPE (step_expr);
10340 init_expr = gimple_convert (&stmts, new_type, init_expr);
10342 /* If we are using the loop mask to "peel" for alignment then we need
10343 to adjust the start value here. */
10344 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10345 if (skip_niters != NULL_TREE)
10347 if (FLOAT_TYPE_P (vectype))
10348 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10349 skip_niters);
10350 else
10351 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10352 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10353 skip_niters, step_expr);
10354 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10355 init_expr, skip_step);
10359 if (stmts)
10361 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10362 gcc_assert (!new_bb);
10365 /* Create the vector that holds the initial_value of the induction. */
10366 if (nested_in_vect_loop)
10368 /* iv_loop is nested in the loop to be vectorized. init_expr has already
10369 been created during vectorization of previous stmts. We obtain it
10370 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10371 auto_vec<tree> vec_inits;
10372 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10373 init_expr, &vec_inits);
10374 vec_init = vec_inits[0];
10375 /* If the initial value is not of proper type, convert it. */
10376 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10378 new_stmt
10379 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10380 vect_simple_var,
10381 "vec_iv_"),
10382 VIEW_CONVERT_EXPR,
10383 build1 (VIEW_CONVERT_EXPR, vectype,
10384 vec_init));
10385 vec_init = gimple_assign_lhs (new_stmt);
10386 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10387 new_stmt);
10388 gcc_assert (!new_bb);
10391 else
10393 /* iv_loop is the loop to be vectorized. Create:
10394 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10395 stmts = NULL;
10396 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10398 unsigned HOST_WIDE_INT const_nunits;
10399 if (nunits.is_constant (&const_nunits))
10401 tree_vector_builder elts (step_vectype, const_nunits, 1);
10402 elts.quick_push (new_name);
10403 for (i = 1; i < const_nunits; i++)
10405 /* Create: new_name_i = new_name + step_expr */
10406 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10407 new_name, step_expr);
10408 elts.quick_push (new_name);
10410 /* Create a vector from [new_name_0, new_name_1, ...,
10411 new_name_nunits-1] */
10412 vec_init = gimple_build_vector (&stmts, &elts);
10414 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10415 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10416 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10417 new_name, step_expr);
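/* VEC_SERIES_EXPR <new_name, step_expr> stands for the (possibly
   variable-length) vector { new_name, new_name + step_expr,
   new_name + 2 * step_expr, ... }, so no per-lane construction is needed.  */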
10418 else
10420 /* Build:
10421 [base, base, base, ...]
10422 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10423 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10424 gcc_assert (flag_associative_math);
10425 tree index = build_index_vector (step_vectype, 0, 1);
10426 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10427 new_name);
10428 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10429 step_expr);
10430 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10431 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10432 vec_init, step_vec);
10433 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10434 vec_init, base_vec);
10436 vec_init = gimple_convert (&stmts, vectype, vec_init);
10438 if (stmts)
10440 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10441 gcc_assert (!new_bb);
10446 /* Create the vector that holds the step of the induction. */
10447 gimple_stmt_iterator *step_iv_si = NULL;
10448 if (nested_in_vect_loop)
10449 /* iv_loop is nested in the loop to be vectorized. Generate:
10450 vec_step = [S, S, S, S] */
10451 new_name = step_expr;
10452 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10454 /* When we're using the loop_len produced by SELECT_VL, the non-final
10455 iterations do not always process VF elements. So instead of the
10456 induction variable update
10458 _21 = vect_vec_iv_.6_22 + { VF, ... };
10460 we should generate:
10462 _35 = .SELECT_VL (ivtmp_33, VF);
10463 vect_cst__22 = [vec_duplicate_expr] _35;
10464 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
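/* In other words, the IV advances by the number of elements actually
   processed in this iteration (_35) multiplied by the scalar step rather
   than by a fixed VF * step.  */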
10465 gcc_assert (!slp_node);
10466 gimple_seq seq = NULL;
10467 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10468 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10469 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10470 unshare_expr (len)),
10471 &seq, true, NULL_TREE);
10472 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10473 step_expr);
10474 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10475 step_iv_si = &si;
10477 else
10479 /* iv_loop is the loop to be vectorized. Generate:
10480 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10481 gimple_seq seq = NULL;
10482 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10484 expr = build_int_cst (integer_type_node, vf);
10485 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10487 else
10488 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10489 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10490 expr, step_expr);
10491 if (seq)
10493 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10494 gcc_assert (!new_bb);
10498 t = unshare_expr (new_name);
10499 gcc_assert (CONSTANT_CLASS_P (new_name)
10500 || TREE_CODE (new_name) == SSA_NAME);
10501 new_vec = build_vector_from_val (step_vectype, t);
10502 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10503 new_vec, step_vectype, step_iv_si);
10506 /* Create the following def-use cycle:
10507 loop prolog:
10508 vec_init = ...
10509 vec_step = ...
10510 loop:
10511 vec_iv = PHI <vec_init, vec_loop>
10513 STMT
10515 vec_loop = vec_iv + vec_step; */
10517 /* Create the induction-phi that defines the induction-operand. */
10518 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10519 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10520 induc_def = PHI_RESULT (induction_phi);
10522 /* Create the iv update inside the loop */
10523 stmts = NULL;
10524 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10525 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10526 vec_def = gimple_convert (&stmts, vectype, vec_def);
10527 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10528 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10530 /* Set the arguments of the phi node: */
10531 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10532 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10533 UNKNOWN_LOCATION);
10535 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10536 *vec_stmt = induction_phi;
10538 /* In case that vectorization factor (VF) is bigger than the number
10539 of elements that we can fit in a vectype (nunits), we have to generate
10540 more than one vector stmt - i.e., we need to "unroll" the
10541 vector stmt by a factor VF/nunits. For more details see documentation
10542 in vectorizable_operation. */
10544 if (ncopies > 1)
10546 gimple_seq seq = NULL;
10547 /* FORNOW. This restriction should be relaxed. */
10548 gcc_assert (!nested_in_vect_loop);
10549 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10550 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10552 /* Create the vector that holds the step of the induction. */
10553 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10555 expr = build_int_cst (integer_type_node, nunits);
10556 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10558 else
10559 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10560 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10561 expr, step_expr);
10562 if (seq)
10564 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10565 gcc_assert (!new_bb);
10568 t = unshare_expr (new_name);
10569 gcc_assert (CONSTANT_CLASS_P (new_name)
10570 || TREE_CODE (new_name) == SSA_NAME);
10571 new_vec = build_vector_from_val (step_vectype, t);
10572 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10573 new_vec, step_vectype, NULL);
10575 vec_def = induc_def;
10576 for (i = 1; i < ncopies + 1; i++)
10578 /* vec_i = vec_prev + vec_step */
10579 gimple_seq stmts = NULL;
10580 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10581 vec_def = gimple_build (&stmts,
10582 PLUS_EXPR, step_vectype, vec_def, vec_step);
10583 vec_def = gimple_convert (&stmts, vectype, vec_def);
10585 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10586 if (i < ncopies)
10588 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10589 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10591 else
10593 /* vec_1 = vec_iv + (VF/n * S)
10594 vec_2 = vec_1 + (VF/n * S)
10596 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10598 vec_n is used as vec_loop to save the large step register and
10599 related operations. */
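/* E.g. with VF 8, nunits 4 and hence n = ncopies = 2 the step built here
   is { 4*S, ... }: vec_1 = vec_iv + { 4*S, ... } provides the second copy
   and vec_2 = vec_1 + { 4*S, ... } = vec_iv + 8*S takes the role of
   vec_loop.  */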
10600 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10601 UNKNOWN_LOCATION);
10606 if (dump_enabled_p ())
10607 dump_printf_loc (MSG_NOTE, vect_location,
10608 "transform induction: created def-use cycle: %G%G",
10609 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10611 return true;
10614 /* Function vectorizable_live_operation_1.
10616 Helper function for vectorizable_live_operation. */
10618 static tree
10619 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10620 stmt_vec_info stmt_info, basic_block exit_bb,
10621 tree vectype, int ncopies, slp_tree slp_node,
10622 tree bitsize, tree bitstart, tree vec_lhs,
10623 tree lhs_type, gimple_stmt_iterator *exit_gsi)
10625 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10627 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10628 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10629 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10630 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10632 gimple_seq stmts = NULL;
10633 tree new_tree;
10635 /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10636 if (integer_zerop (bitstart))
10638 tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10639 vec_lhs_phi, bitsize, bitstart);
10641 /* Convert the extracted vector element to the scalar type. */
10642 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10644 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10646 /* Emit:
10648 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10650 where VEC_LHS is the vectorized live-out result and LEN is the
10651 number of elements processed in the final iteration. */
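/* E.g. with a partial load/store bias of 0 and LEN 3 the last active
   element sits at index LEN + BIAS - 1 = 2.  */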
10652 gcc_assert (ncopies == 1 && !slp_node);
10653 gimple_seq tem = NULL;
10654 gimple_stmt_iterator gsi = gsi_last (tem);
10655 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10656 &LOOP_VINFO_LENS (loop_vinfo),
10657 1, vectype, 0, 0);
10659 /* BIAS - 1. */
10660 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10661 tree bias_minus_one
10662 = int_const_binop (MINUS_EXPR,
10663 build_int_cst (TREE_TYPE (len), biasval),
10664 build_one_cst (TREE_TYPE (len)));
10666 /* LAST_INDEX = LEN + (BIAS - 1). */
10667 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10668 len, bias_minus_one);
10670 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10671 tree scalar_res
10672 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10673 vec_lhs_phi, last_index);
10675 /* Convert the extracted vector element to the scalar type. */
10676 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10678 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10680 /* Emit:
10682 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10684 where VEC_LHS is the vectorized live-out result and MASK is
10685 the loop mask for the final iteration. */
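/* E.g. if only the first three of eight lanes are active in the final
   iteration, EXTRACT_LAST returns the element in lane 2 of VEC_LHS.  */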
10686 gcc_assert (!slp_node);
10687 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10688 gimple_seq tem = NULL;
10689 gimple_stmt_iterator gsi = gsi_last (tem);
10690 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10691 &LOOP_VINFO_MASKS (loop_vinfo),
10692 1, vectype, 0);
10693 tree scalar_res;
10694 gimple_seq_add_seq (&stmts, tem);
10696 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10697 mask, vec_lhs_phi);
10699 /* Convert the extracted vector element to the scalar type. */
10700 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10702 else
10704 tree bftype = TREE_TYPE (vectype);
10705 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10706 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10707 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10708 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10709 &stmts, true, NULL_TREE);
10712 *exit_gsi = gsi_after_labels (exit_bb);
10713 if (stmts)
10714 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10716 return new_tree;
10719 /* Function vectorizable_live_operation.
10721 STMT_INFO computes a value that is used outside the loop. Check if
10722 it can be supported. */
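/* For example (illustrative source):

   int last = 0;
   for (int i = 0; i < n; i++)
     last = a[i] + 1;
   use (last);

   The value of 'last' from the final iteration is live, so after
   vectorization it has to be extracted from the appropriate lane of the
   last vector result.  */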
10724 bool
10725 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10726 slp_tree slp_node, slp_instance slp_node_instance,
10727 int slp_index, bool vec_stmt_p,
10728 stmt_vector_for_cost *cost_vec)
10730 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10731 imm_use_iterator imm_iter;
10732 tree lhs, lhs_type, bitsize;
10733 tree vectype = (slp_node
10734 ? SLP_TREE_VECTYPE (slp_node)
10735 : STMT_VINFO_VECTYPE (stmt_info));
10736 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10737 int ncopies;
10738 gimple *use_stmt;
10739 use_operand_p use_p;
10740 auto_vec<tree> vec_oprnds;
10741 int vec_entry = 0;
10742 poly_uint64 vec_index = 0;
10744 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10745 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10747 /* If a stmt of a reduction is live, vectorize it via
10748 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10749 validity so just trigger the transform here. */
10750 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10752 if (!vec_stmt_p)
10753 return true;
10754 if (slp_node)
10756 /* For reduction chains the meta-info is attached to
10757 the group leader. */
10758 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10759 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10760 /* For SLP reductions we vectorize the epilogue for
10761 all involved stmts together. */
10762 else if (slp_index != 0)
10763 return true;
10765 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10766 gcc_assert (reduc_info->is_reduc_info);
10767 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10768 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10769 return true;
10771 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10772 slp_node_instance,
10773 LOOP_VINFO_IV_EXIT (loop_vinfo));
10775 /* With an early break we only have to materialize the reduction on the
10776 merge block, but we have to find an alternate exit first. */
10777 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10779 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10780 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10782 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10783 slp_node, slp_node_instance,
10784 exit);
10785 break;
10789 return true;
10792 /* If STMT is not relevant and it is a simple assignment and its inputs are
10793 invariant then it can remain in place, unvectorized. The original last
10794 scalar value that it computes will be used. */
10795 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10797 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10798 if (dump_enabled_p ())
10799 dump_printf_loc (MSG_NOTE, vect_location,
10800 "statement is simple and all its uses are invariant. Leaving in "
10801 "place.\n");
10802 return true;
10805 if (slp_node)
10806 ncopies = 1;
10807 else
10808 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10810 if (slp_node)
10812 gcc_assert (slp_index >= 0);
10814 /* Get the last occurrence of the scalar index from the concatenation of
10815 all the slp vectors. Calculate which slp vector it is and the index
10816 within. */
10817 int num_scalar = SLP_TREE_LANES (slp_node);
10818 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10819 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10821 /* Calculate which vector contains the result, and which lane of
10822 that vector we need. */
10823 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10825 if (dump_enabled_p ())
10826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10827 "Cannot determine which vector holds the"
10828 " final result.\n");
10829 return false;
10833 if (!vec_stmt_p)
10835 /* No transformation required. */
10836 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10838 if (slp_node)
10840 if (dump_enabled_p ())
10841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10842 "can't operate on partial vectors "
10843 "because an SLP statement is live after "
10844 "the loop.\n");
10845 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10847 else if (ncopies > 1)
10849 if (dump_enabled_p ())
10850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10851 "can't operate on partial vectors "
10852 "because ncopies is greater than 1.\n");
10853 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10855 else
10857 gcc_assert (ncopies == 1 && !slp_node);
10858 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10859 OPTIMIZE_FOR_SPEED))
10860 vect_record_loop_mask (loop_vinfo,
10861 &LOOP_VINFO_MASKS (loop_vinfo),
10862 1, vectype, NULL);
10863 else if (can_vec_extract_var_idx_p (
10864 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10865 vect_record_loop_len (loop_vinfo,
10866 &LOOP_VINFO_LENS (loop_vinfo),
10867 1, vectype, 1);
10868 else
10870 if (dump_enabled_p ())
10871 dump_printf_loc (
10872 MSG_MISSED_OPTIMIZATION, vect_location,
10873 "can't operate on partial vectors "
10874 "because the target doesn't support extract "
10875 "last reduction.\n");
10876 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10880 /* ??? Enable for loop costing as well. */
10881 if (!loop_vinfo)
10882 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10883 0, vect_epilogue);
10884 return true;
10887 /* Use the lhs of the original scalar statement. */
10888 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10889 if (dump_enabled_p ())
10890 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10891 "stmt %G", stmt);
10893 lhs = gimple_get_lhs (stmt);
10894 lhs_type = TREE_TYPE (lhs);
10896 bitsize = vector_element_bits_tree (vectype);
10898 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10899 tree vec_lhs, vec_lhs0, bitstart;
10900 gimple *vec_stmt, *vec_stmt0;
10901 if (slp_node)
10903 gcc_assert (!loop_vinfo
10904 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10905 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10907 /* Get the correct slp vectorized stmt. */
10908 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10909 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10911 /* In case we need to early break vectorize also get the first stmt. */
10912 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10913 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10915 /* Get entry to use. */
10916 bitstart = bitsize_int (vec_index);
10917 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10919 else
10921 /* For multiple copies, get the last copy. */
10922 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10923 vec_lhs = gimple_get_lhs (vec_stmt);
10925 /* In case we need to early break vectorize also get the first stmt. */
10926 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10927 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10929 /* Get the last lane in the vector. */
10930 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10933 if (loop_vinfo)
10935 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10936 requirement, insert one phi node for it. It looks like:
10937 loop;
10939 # lhs' = PHI <lhs>
10941 loop;
10943 # vec_lhs' = PHI <vec_lhs>
10944 new_tree = lane_extract <vec_lhs', ...>;
10945 lhs' = new_tree; */
10947 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10948 /* Check if we have a loop where the chosen exit is not the main exit.
10949 In these cases for an early break we restart the iteration the vector
10950 code did. For the live values we want the value at the start of the
10951 iteration rather than at the end. */
10952 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10953 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10954 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10955 if (!is_gimple_debug (use_stmt)
10956 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10957 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10959 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10960 phi_arg_index_from_use (use_p));
10961 gcc_assert (loop_exit_edge_p (loop, e));
10962 bool main_exit_edge = e == main_e;
10963 tree tmp_vec_lhs = vec_lhs;
10964 tree tmp_bitstart = bitstart;
10966 /* For an early exit where the exit is not in the BB that leads
10967 to the latch we're restarting the iteration in the
10968 scalar loop. So get the first live value. */
10969 restart_loop = restart_loop || !main_exit_edge;
10970 if (restart_loop
10971 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10973 tmp_vec_lhs = vec_lhs0;
10974 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10977 gimple_stmt_iterator exit_gsi;
10978 tree new_tree
10979 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10980 e->dest, vectype, ncopies,
10981 slp_node, bitsize,
10982 tmp_bitstart, tmp_vec_lhs,
10983 lhs_type, &exit_gsi);
10985 auto gsi = gsi_for_stmt (use_stmt);
10986 remove_phi_node (&gsi, false);
10987 tree lhs_phi = gimple_phi_result (use_stmt);
10988 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10989 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10990 break;
10993 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10994 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10995 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10997 else
10999 /* For basic-block vectorization simply insert the lane-extraction. */
11000 tree bftype = TREE_TYPE (vectype);
11001 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11002 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11003 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11004 vec_lhs, bitsize, bitstart);
11005 gimple_seq stmts = NULL;
11006 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11007 &stmts, true, NULL_TREE);
11008 if (TREE_CODE (new_tree) == SSA_NAME
11009 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11010 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11011 if (is_a <gphi *> (vec_stmt))
11013 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11014 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11016 else
11018 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11019 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11022 /* Replace the use of lhs with the newly computed result. If the use stmt
11023 is a single-arg PHI, just replace all uses of the PHI result, since the
11024 lcssa PHI defining lhs may be before the newly inserted stmt. */
11025 use_operand_p use_p;
11026 stmt_vec_info use_stmt_info;
11027 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11028 if (!is_gimple_debug (use_stmt)
11029 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11030 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11032 /* ??? This can happen when the live lane ends up being
11033 rooted in a vector construction code-generated by an
11034 external SLP node (and code-generation for that already
11035 happened). See gcc.dg/vect/bb-slp-47.c.
11036 Doing this is what would happen if that vector CTOR
11037 were not code-generated yet so it is not too bad.
11038 ??? In fact we'd likely want to avoid this situation
11039 in the first place. */
11040 if (TREE_CODE (new_tree) == SSA_NAME
11041 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11042 && gimple_code (use_stmt) != GIMPLE_PHI
11043 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11044 use_stmt))
11046 if (dump_enabled_p ())
11047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11048 "Using original scalar computation for "
11049 "live lane because use precedes vector "
11050 "def\n");
11051 continue;
11053 /* ??? It can also happen that we end up pulling a def into
11054 a loop where replacing out-of-loop uses would require
11055 a new LC SSA PHI node. Retain the original scalar in
11056 those cases as well. PR98064. */
11057 if (TREE_CODE (new_tree) == SSA_NAME
11058 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11059 && (gimple_bb (use_stmt)->loop_father
11060 != gimple_bb (vec_stmt)->loop_father)
11061 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11062 gimple_bb (use_stmt)->loop_father))
11064 if (dump_enabled_p ())
11065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11066 "Using original scalar computation for "
11067 "live lane because there is an out-of-loop "
11068 "definition for it\n");
11069 continue;
11071 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11072 SET_USE (use_p, new_tree);
11073 update_stmt (use_stmt);
11077 return true;
11080 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11082 static void
11083 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11085 ssa_op_iter op_iter;
11086 imm_use_iterator imm_iter;
11087 def_operand_p def_p;
11088 gimple *ustmt;
11090 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11092 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11094 basic_block bb;
11096 if (!is_gimple_debug (ustmt))
11097 continue;
11099 bb = gimple_bb (ustmt);
11101 if (!flow_bb_inside_loop_p (loop, bb))
11103 if (gimple_debug_bind_p (ustmt))
11105 if (dump_enabled_p ())
11106 dump_printf_loc (MSG_NOTE, vect_location,
11107 "killing debug use\n");
11109 gimple_debug_bind_reset_value (ustmt);
11110 update_stmt (ustmt);
11112 else
11113 gcc_unreachable ();
11119 /* Given loop represented by LOOP_VINFO, return true if computation of
11120 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11121 otherwise. */
11123 static bool
11124 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11126 /* Constant case. */
11127 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11129 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11130 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11132 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11133 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11134 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11135 return true;
11138 widest_int max;
11139 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11140 /* Check the upper bound of loop niters. */
11141 if (get_max_loop_iterations (loop, &max))
11143 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11144 signop sgn = TYPE_SIGN (type);
11145 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11146 if (max < type_max)
11147 return true;
11149 return false;
11152 /* Return a mask type with half the number of elements as OLD_TYPE,
11153 given that it should have mode NEW_MODE. */
11155 tree
11156 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11158 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11159 return build_truth_vector_type_for_mode (nunits, new_mode);
11162 /* Return a mask type with twice as many elements as OLD_TYPE,
11163 given that it should have mode NEW_MODE. */
11165 tree
11166 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11168 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11169 return build_truth_vector_type_for_mode (nunits, new_mode);
11172 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11173 contain a sequence of NVECTORS masks that each control a vector of type
11174 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11175 these vector masks with the vector version of SCALAR_MASK. */
11177 void
11178 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11179 unsigned int nvectors, tree vectype, tree scalar_mask)
11181 gcc_assert (nvectors != 0);
11183 if (scalar_mask)
11185 scalar_cond_masked_key cond (scalar_mask, nvectors);
11186 loop_vinfo->scalar_cond_masked_set.add (cond);
11189 masks->mask_set.add (std::make_pair (vectype, nvectors));
11192 /* Given a complete set of masks MASKS, extract mask number INDEX
11193 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11194 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11196 See the comment above vec_loop_masks for more details about the mask
11197 arrangement. */
11199 tree
11200 vect_get_loop_mask (loop_vec_info loop_vinfo,
11201 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11202 unsigned int nvectors, tree vectype, unsigned int index)
11204 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11205 == vect_partial_vectors_while_ult)
11207 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11208 tree mask_type = rgm->type;
11210 /* Populate the rgroup's mask array, if this is the first time we've
11211 used it. */
11212 if (rgm->controls.is_empty ())
11214 rgm->controls.safe_grow_cleared (nvectors, true);
11215 for (unsigned int i = 0; i < nvectors; ++i)
11217 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11218 /* Provide a dummy definition until the real one is available. */
11219 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11220 rgm->controls[i] = mask;
11224 tree mask = rgm->controls[index];
11225 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11226 TYPE_VECTOR_SUBPARTS (vectype)))
11228 /* A loop mask for data type X can be reused for data type Y
11229 if X has N times more elements than Y and if Y's elements
11230 are N times bigger than X's. In this case each sequence
11231 of N elements in the loop mask will be all-zero or all-one.
11232 We can then view-convert the mask so that each sequence of
11233 N elements is replaced by a single element. */
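/* For instance (types purely illustrative): a mask recorded for sixteen
   QImode elements can be reused for four SImode elements; each group of
   four mask elements is known to be all-zero or all-one, so the
   view-convert collapses each group into the single element the SImode
   operation needs.  */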
11234 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11235 TYPE_VECTOR_SUBPARTS (vectype)));
11236 gimple_seq seq = NULL;
11237 mask_type = truth_type_for (vectype);
11238 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11239 if (seq)
11240 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11242 return mask;
11244 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11245 == vect_partial_vectors_avx512)
11247 /* The number of scalars per iteration and the number of vectors are
11248 both compile-time constants. */
11249 unsigned int nscalars_per_iter
11250 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11251 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11253 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11255 /* The stored nV is dependent on the mask type produced. */
11256 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11257 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11258 == rgm->factor);
11259 nvectors = rgm->factor;
11261 /* Populate the rgroup's mask array, if this is the first time we've
11262 used it. */
11263 if (rgm->controls.is_empty ())
11265 rgm->controls.safe_grow_cleared (nvectors, true);
11266 for (unsigned int i = 0; i < nvectors; ++i)
11268 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11269 /* Provide a dummy definition until the real one is available. */
11270 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11271 rgm->controls[i] = mask;
11274 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11275 TYPE_VECTOR_SUBPARTS (vectype)))
11276 return rgm->controls[index];
11278 /* Split the vector if needed. Since we are dealing with integer mode
11279 masks with AVX512 we can operate on the integer representation,
11280 performing the shift on the whole vector. */
11281 unsigned HOST_WIDE_INT factor;
11282 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11283 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11284 gcc_assert (ok);
11285 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11286 tree mask_type = truth_type_for (vectype);
11287 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11288 unsigned vi = index / factor;
11289 unsigned vpart = index % factor;
11290 tree vec = rgm->controls[vi];
11291 gimple_seq seq = NULL;
11292 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11293 lang_hooks.types.type_for_mode
11294 (TYPE_MODE (rgm->type), 1), vec);
11295 /* For integer mode masks simply shift the right bits into position. */
11296 if (vpart != 0)
11297 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11298 build_int_cst (integer_type_node,
11299 (TYPE_VECTOR_SUBPARTS (vectype)
11300 * vpart)));
11301 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11302 (TYPE_MODE (mask_type), 1), vec);
11303 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11304 if (seq)
11305 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11306 return vec;
11308 else
11309 gcc_unreachable ();
11312 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11313 lengths for controlling an operation on VECTYPE. The operation splits
11314 each element of VECTYPE into FACTOR separate subelements, measuring the
11315 length as a number of these subelements. */
11317 void
11318 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11319 unsigned int nvectors, tree vectype, unsigned int factor)
11321 gcc_assert (nvectors != 0);
11322 if (lens->length () < nvectors)
11323 lens->safe_grow_cleared (nvectors, true);
11324 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11326 /* The number of scalars per iteration, the bytes occupied per scalar and
11327 the number of vectors are all compile-time constants. */
11328 unsigned int nscalars_per_iter
11329 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11330 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11332 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11334 /* For now, we only support cases in which all loads and stores fall back
11335 to VnQI or none do. */
11336 gcc_assert (!rgl->max_nscalars_per_iter
11337 || (rgl->factor == 1 && factor == 1)
11338 || (rgl->max_nscalars_per_iter * rgl->factor
11339 == nscalars_per_iter * factor));
11340 rgl->max_nscalars_per_iter = nscalars_per_iter;
11341 rgl->type = vectype;
11342 rgl->factor = factor;
11346 /* Given a complete set of lengths LENS, extract length number INDEX
11347 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11348 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11349 multiplied by the number of elements that should be processed.
11350 Insert any set-up statements before GSI. */
11352 tree
11353 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11354 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11355 unsigned int index, unsigned int factor)
11357 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11358 bool use_bias_adjusted_len =
11359 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11361 /* Populate the rgroup's len array, if this is the first time we've
11362 used it. */
11363 if (rgl->controls.is_empty ())
11365 rgl->controls.safe_grow_cleared (nvectors, true);
11366 for (unsigned int i = 0; i < nvectors; ++i)
11368 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11369 gcc_assert (len_type != NULL_TREE);
11371 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11373 /* Provide a dummy definition until the real one is available. */
11374 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11375 rgl->controls[i] = len;
11377 if (use_bias_adjusted_len)
11379 gcc_assert (i == 0);
11380 tree adjusted_len =
11381 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11382 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11383 rgl->bias_adjusted_ctrl = adjusted_len;
11388 if (use_bias_adjusted_len)
11389 return rgl->bias_adjusted_ctrl;
11391 tree loop_len = rgl->controls[index];
11392 if (rgl->factor == 1 && factor == 1)
11394 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11395 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11396 if (maybe_ne (nunits1, nunits2))
11398 /* A loop len for data type X can be reused for data type Y
11399 if X has N times more elements than Y and if Y's elements
11400 are N times bigger than X's. */
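/* For instance (illustrative types): a length recorded for sixteen QImode
   elements is reused for four SImode elements by dividing it by the
   factor 4, since the recorded length counts the finer-grained elements.  */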
11401 gcc_assert (multiple_p (nunits1, nunits2));
11402 factor = exact_div (nunits1, nunits2).to_constant ();
11403 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11404 gimple_seq seq = NULL;
11405 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11406 build_int_cst (iv_type, factor));
11407 if (seq)
11408 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11411 return loop_len;
11414 /* Scale profiling counters by estimation for LOOP which is vectorized
11415 by factor VF.
11416 If FLAT is true, the loop we started with had an unrealistically flat
11417 profile. */
11419 static void
11420 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11422 /* For flat profiles do not scale down proportionally by VF and only
11423 cap by known iteration count bounds. */
11424 if (flat)
11426 if (dump_file && (dump_flags & TDF_DETAILS))
11427 fprintf (dump_file,
11428 "Vectorized loop profile seems flat; not scaling iteration "
11429 "count down by the vectorization factor %i\n", vf);
11430 scale_loop_profile (loop, profile_probability::always (),
11431 get_likely_max_loop_iterations_int (loop));
11432 return;
11434 /* The loop body executes VF times fewer and the exit probability increases VF times. */
11435 profile_count entry_count = loop_preheader_edge (loop)->count ();
11437 /* If we have an unreliable loop profile, avoid dropping the entry
11438 count below the header count. This can happen since such loops
11439 have unrealistically low trip counts. */
11440 while (vf > 1
11441 && loop->header->count > entry_count
11442 && loop->header->count < entry_count * vf)
11444 if (dump_file && (dump_flags & TDF_DETAILS))
11445 fprintf (dump_file,
11446 "Vectorization factor %i seems too large for profile "
11447 "previously believed to be consistent; reducing.\n", vf);
11448 vf /= 2;
11451 if (entry_count.nonzero_p ())
11452 set_edge_probability_and_rescale_others
11453 (exit_e,
11454 entry_count.probability_in (loop->header->count / vf));
11455 /* Avoid producing a very large exit probability when we do not have
11456 a sensible profile. */
11457 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11458 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11459 loop->latch->count = single_pred_edge (loop->latch)->count ();
11461 scale_loop_profile (loop, profile_probability::always () / vf,
11462 get_likely_max_loop_iterations_int (loop));
11465 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11466 latch edge values originally defined by it. */
11468 static void
11469 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11470 stmt_vec_info def_stmt_info)
11472 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11473 if (!def || TREE_CODE (def) != SSA_NAME)
11474 return;
11475 stmt_vec_info phi_info;
11476 imm_use_iterator iter;
11477 use_operand_p use_p;
11478 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11480 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11481 if (!phi)
11482 continue;
11483 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11484 && (phi_info = loop_vinfo->lookup_stmt (phi))
11485 && STMT_VINFO_RELEVANT_P (phi_info)))
11486 continue;
11487 loop_p loop = gimple_bb (phi)->loop_father;
11488 edge e = loop_latch_edge (loop);
11489 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11490 continue;
11492 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11493 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11494 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11496 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11497 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11498 gcc_assert (phi_defs.length () == latch_defs.length ());
11499 for (unsigned i = 0; i < phi_defs.length (); ++i)
11500 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11501 gimple_get_lhs (latch_defs[i]), e,
11502 gimple_phi_arg_location (phi, e->dest_idx));
11504 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11506 /* For first order recurrences we have to update both uses of
11507 the latch definition, the one in the PHI node and the one
11508 in the generated VEC_PERM_EXPR. */
11509 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11510 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11511 gcc_assert (phi_defs.length () == latch_defs.length ());
11512 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11513 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11514 for (unsigned i = 0; i < phi_defs.length (); ++i)
11516 gassign *perm = as_a <gassign *> (phi_defs[i]);
11517 if (i > 0)
11518 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11519 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11520 update_stmt (perm);
11522 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11523 gimple_phi_arg_location (phi, e->dest_idx));
11528 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11529 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11530 stmt_vec_info. */
11532 static bool
11533 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11534 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11536 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11537 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11539 if (dump_enabled_p ())
11540 dump_printf_loc (MSG_NOTE, vect_location,
11541 "------>vectorizing statement: %G", stmt_info->stmt);
11543 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11544 vect_loop_kill_debug_uses (loop, stmt_info);
11546 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11547 && !STMT_VINFO_LIVE_P (stmt_info))
11549 if (is_gimple_call (stmt_info->stmt)
11550 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11552 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11553 *seen_store = stmt_info;
11554 return false;
11556 return false;
11559 if (STMT_VINFO_VECTYPE (stmt_info))
11561 poly_uint64 nunits
11562 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11563 if (!STMT_SLP_TYPE (stmt_info)
11564 && maybe_ne (nunits, vf)
11565 && dump_enabled_p ())
11566 /* For SLP VF is set according to unrolling factor, and not
11567 to vector size, hence for SLP this print is not valid. */
11568 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11571 /* Pure SLP statements have already been vectorized. We still need
11572 to apply loop vectorization to hybrid SLP statements. */
11573 if (PURE_SLP_STMT (stmt_info))
11574 return false;
11576 if (dump_enabled_p ())
11577 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11579 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11580 *seen_store = stmt_info;
11582 return true;
11585 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11586 in the hash_map with their corresponding values. */
11588 static tree
11589 find_in_mapping (tree t, void *context)
11591 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11593 tree *value = mapping->get (t);
11594 return value ? *value : t;
11597 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11598 original loop that has now been vectorized.
11600 The inits of the data_references need to be advanced with the number of
11601 iterations of the main loop. This has been computed in vect_do_peeling and
11602 is stored in parameter ADVANCE. We first restore the data_references'
11603 initial offsets with the values recorded in ORIG_DRS_INIT.
11605 Since the loop_vec_info of this EPILOGUE was constructed for the original
11606 loop, its stmt_vec_infos all point to the original statements. These need
11607 to be updated to point to their corresponding copies as well as the SSA_NAMES
11608 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11610 The data_references' connections also need to be updated. Their
11611 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11612 stmt_vec_infos, their statements need to point to their corresponding copy,
11613 if they are gather loads or scatter stores then their reference needs to be
11614 updated to point to its corresponding copy and finally we set
11615 'base_misaligned' to false as we have already peeled for alignment in the
11616 prologue of the main loop. */
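/* As an illustration (with hypothetical SSA names): if the main loop
   contains "x_1 = a[i_2]" and its epilogue copy contains "x_7 = a[i_9]",
   the code below records the mapping x_1 -> x_7 (and i_2 -> i_9 for the
   PHI results) so that PATTERN_DEF_SEQs, RELATED_STMTs and gather/scatter
   DR_REFs copied from the main loop can be rewritten in terms of the
   epilogue's SSA names.  */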
11618 static void
11619 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11621 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11622 auto_vec<gimple *> stmt_worklist;
11623 hash_map<tree,tree> mapping;
11624 gimple *orig_stmt, *new_stmt;
11625 gimple_stmt_iterator epilogue_gsi;
11626 gphi_iterator epilogue_phi_gsi;
11627 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11628 basic_block *epilogue_bbs = get_loop_body (epilogue);
11629 unsigned i;
11631 free (LOOP_VINFO_BBS (epilogue_vinfo));
11632 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11634 /* Advance data_reference's with the number of iterations of the previous
11635 loop and its prologue. */
11636 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11639 /* The EPILOGUE loop is a copy of the original loop so they share the same
11640 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11641 point to the copied statements. We also create a mapping of all LHSs in
11642 the original loop and all the LHSs in the EPILOGUE and create worklists to
11643 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11644 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11646 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11647 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11649 new_stmt = epilogue_phi_gsi.phi ();
11651 gcc_assert (gimple_uid (new_stmt) > 0);
11652 stmt_vinfo
11653 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11655 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11656 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11658 mapping.put (gimple_phi_result (orig_stmt),
11659 gimple_phi_result (new_stmt));
11660 /* PHI nodes cannot have patterns or related statements. */
11661 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11662 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11665 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11666 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11668 new_stmt = gsi_stmt (epilogue_gsi);
11669 if (is_gimple_debug (new_stmt))
11670 continue;
11672 gcc_assert (gimple_uid (new_stmt) > 0);
11673 stmt_vinfo
11674 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11676 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11677 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11679 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11680 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11682 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11684 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11685 for (gimple_stmt_iterator gsi = gsi_start (seq);
11686 !gsi_end_p (gsi); gsi_next (&gsi))
11687 stmt_worklist.safe_push (gsi_stmt (gsi));
11690 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11691 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11693 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11694 stmt_worklist.safe_push (stmt);
11695 /* Set BB such that the assert in
11696 'get_initial_def_for_reduction' is able to determine that
11697 the BB of the related stmt is inside this loop. */
11698 gimple_set_bb (stmt,
11699 gimple_bb (new_stmt));
11700 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11701 gcc_assert (related_vinfo == NULL
11702 || related_vinfo == stmt_vinfo);
11707 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11708 using the original main loop and thus need to be updated to refer to the
11709 cloned variables used in the epilogue. */
11710 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11712 gimple *stmt = stmt_worklist[i];
11713 tree *new_op;
11715 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11717 tree op = gimple_op (stmt, j);
11718 if ((new_op = mapping.get(op)))
11719 gimple_set_op (stmt, j, *new_op);
11720 else
11722 /* PR92429: The last argument of simplify_replace_tree disables
11723 folding when replacing arguments. This is required as
11724 otherwise you might end up with different statements than the
11725 ones analyzed in vect_loop_analyze, leading to different
11726 vectorization. */
11727 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11728 &find_in_mapping, &mapping, false);
11729 gimple_set_op (stmt, j, op);
11734 struct data_reference *dr;
11735 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11736 FOR_EACH_VEC_ELT (datarefs, i, dr)
11738 orig_stmt = DR_STMT (dr);
11739 gcc_assert (gimple_uid (orig_stmt) > 0);
11740 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11741 /* Data references for gather loads and scatter stores do not use the
11742 updated offset we set using ADVANCE. Instead we have to make sure the
11743 reference in the data reference points to the corresponding copy of
11744 the original in the epilogue. Make sure to update both
11745 gather/scatters recognized by dataref analysis and also other
11746 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11747 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11748 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11749 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11751 DR_REF (dr)
11752 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11753 &find_in_mapping, &mapping);
11754 DR_BASE_ADDRESS (dr)
11755 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11756 &find_in_mapping, &mapping);
11758 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11759 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11760 /* The vector size of the epilogue is smaller than that of the main loop
11761 so the alignment is either the same or lower. This means the dr will
11762 by definition be aligned. */
11763 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11766 epilogue_vinfo->shared->datarefs_copy.release ();
11767 epilogue_vinfo->shared->save_datarefs ();
11770 /* When vectorizing early break statements, instructions that happen before
11771 the early break in the current BB need to be moved to after the early
11772 break. This function deals with that and assumes that any validity
11773 checks have already been performed.
11775 While moving the instructions, if it encounters a VUSE or VDEF it
11776 corrects the VUSEs as it moves the statements along. LOOP_VINFO_EARLY_BRK_DEST_BB
11777 gives the block in which to insert the moved statements. */
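/* As an illustrative sketch (hypothetical statements): given

     a[i] = x;                  <- store that happens before the break
     if (b[i] > 0)
       break;

   the store to a[i] is moved after the early exit test into the block
   recorded in LOOP_VINFO_EARLY_BRK_DEST_BB, and the loads recorded in
   LOOP_VINFO_EARLY_BRK_VUSES are updated to use the VUSE of the last
   moved store.  */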
11779 static void
11780 move_early_exit_stmts (loop_vec_info loop_vinfo)
11782 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11784 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11785 return;
11787 /* Move all stmts that need moving. */
11788 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11789 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11791 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11793 /* Check to see if statement is still required for vect or has been
11794 elided. */
11795 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11796 if (!stmt_info)
11797 continue;
11799 if (dump_enabled_p ())
11800 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11802 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11803 gsi_move_before (&stmt_gsi, &dest_gsi);
11804 gsi_prev (&dest_gsi);
11807 /* Update all the stmts with their new reaching VUSES. */
11808 tree vuse
11809 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11810 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11812 if (dump_enabled_p ())
11813 dump_printf_loc (MSG_NOTE, vect_location,
11814 "updating vuse to %T for load %G", vuse, p);
11815 gimple_set_vuse (p, vuse);
11816 update_stmt (p);
11819 /* And update the LC PHIs on exits. */
11820 for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11821 if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11822 if (gphi *phi = get_virtual_phi (e->dest))
11823 SET_PHI_ARG_DEF_ON_EDGE (phi, e, vuse);
11826 /* Function vect_transform_loop.
11828 The analysis phase has determined that the loop is vectorizable.
11829 Vectorize the loop - create vectorized stmts to replace the scalar
11830 stmts in the loop, and update the loop exit condition.
11831 Returns scalar epilogue loop if any. */
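/* Roughly, the transformation below proceeds as follows: version the loop
   if required, peel a prologue/epilogue via vect_do_peeling, schedule any
   SLP instances, vectorize the remaining relevant scalar statements one
   basic block at a time, then update the loop bounds, exit condition and
   profile, and finally hand back the scalar epilogue loop (if one was
   created) for possible further vectorization.  */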
11833 class loop *
11834 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11836 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11837 class loop *epilogue = NULL;
11838 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11839 int nbbs = loop->num_nodes;
11840 int i;
11841 tree niters_vector = NULL_TREE;
11842 tree step_vector = NULL_TREE;
11843 tree niters_vector_mult_vf = NULL_TREE;
11844 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11845 unsigned int lowest_vf = constant_lower_bound (vf);
11846 gimple *stmt;
11847 bool check_profitability = false;
11848 unsigned int th;
11849 bool flat = maybe_flat_loop_profile (loop);
11851 DUMP_VECT_SCOPE ("vec_transform_loop");
11853 loop_vinfo->shared->check_datarefs ();
11855 /* Use the more conservative vectorization threshold. If the number
11856 of iterations is constant assume the cost check has been performed
11857 by our caller. If the threshold makes all loops profitable that
11858 run at least the (estimated) vectorization factor number of times
11859 checking is pointless, too. */
11860 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11861 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11863 if (dump_enabled_p ())
11864 dump_printf_loc (MSG_NOTE, vect_location,
11865 "Profitability threshold is %d loop iterations.\n",
11866 th);
11867 check_profitability = true;
11870 /* Make sure there exists a single-predecessor exit bb. Do this before
11871 versioning. */
11872 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11873 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11875 split_loop_exit_edge (e, true);
11876 if (dump_enabled_p ())
11877 dump_printf (MSG_NOTE, "split exit edge\n");
11880 /* Version the loop first, if required, so the profitability check
11881 comes first. */
11883 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11885 class loop *sloop
11886 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11887 sloop->force_vectorize = false;
11888 check_profitability = false;
11891 /* Make sure there exists a single-predecessor exit bb also on the
11892 scalar loop copy. Do this after versioning but before peeling
11893 so CFG structure is fine for both scalar and if-converted loop
11894 to make slpeel_duplicate_current_defs_from_edges face matched
11895 loop closed PHI nodes on the exit. */
11896 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11898 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11899 if (! single_pred_p (e->dest))
11901 split_loop_exit_edge (e, true);
11902 if (dump_enabled_p ())
11903 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11907 tree niters = vect_build_loop_niters (loop_vinfo);
11908 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11909 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11910 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11911 tree advance;
11912 drs_init_vec orig_drs_init;
11914 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11915 &step_vector, &niters_vector_mult_vf, th,
11916 check_profitability, niters_no_overflow,
11917 &advance);
11918 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11919 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11921 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11922 block after the loop exit. We need to scale all of that. */
11923 basic_block preheader
11924 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11925 preheader->count
11926 = preheader->count.apply_probability
11927 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11928 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11929 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11930 LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11933 if (niters_vector == NULL_TREE)
11935 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11936 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11937 && known_eq (lowest_vf, vf))
11939 niters_vector
11940 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11941 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11942 step_vector = build_one_cst (TREE_TYPE (niters));
11944 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11945 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11946 &step_vector, niters_no_overflow);
11947 else
11948 /* vect_do_peeling subtracted the number of peeled prologue
11949 iterations from LOOP_VINFO_NITERS. */
11950 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11951 &niters_vector, &step_vector,
11952 niters_no_overflow);
11955 /* 1) Make sure the loop header has exactly two entries
11956 2) Make sure we have a preheader basic block. */
11958 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11960 split_edge (loop_preheader_edge (loop));
11962 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11963 /* This will deal with any possible peeling. */
11964 vect_prepare_for_masked_peels (loop_vinfo);
11966 /* Handle any code motion that we need to for early-break vectorization after
11967 we've done peeling but just before we start vectorizing. */
11968 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11969 move_early_exit_stmts (loop_vinfo);
11971 /* Schedule the SLP instances first, then handle loop vectorization
11972 below. */
11973 if (!loop_vinfo->slp_instances.is_empty ())
11975 DUMP_VECT_SCOPE ("scheduling SLP instances");
11976 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11979 /* FORNOW: the vectorizer supports only loops whose body consists
11980 of one basic block (header + empty latch). When the vectorizer
11981 supports more involved loop forms, the order in which the BBs are
11982 traversed will need to be reconsidered. */
11984 for (i = 0; i < nbbs; i++)
11986 basic_block bb = bbs[i];
11987 stmt_vec_info stmt_info;
11989 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11990 gsi_next (&si))
11992 gphi *phi = si.phi ();
11993 if (dump_enabled_p ())
11994 dump_printf_loc (MSG_NOTE, vect_location,
11995 "------>vectorizing phi: %G", (gimple *) phi);
11996 stmt_info = loop_vinfo->lookup_stmt (phi);
11997 if (!stmt_info)
11998 continue;
12000 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12001 vect_loop_kill_debug_uses (loop, stmt_info);
12003 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12004 && !STMT_VINFO_LIVE_P (stmt_info))
12005 continue;
12007 if (STMT_VINFO_VECTYPE (stmt_info)
12008 && (maybe_ne
12009 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12010 && dump_enabled_p ())
12011 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12013 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12014 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12015 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12016 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12017 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12018 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12019 && ! PURE_SLP_STMT (stmt_info))
12021 if (dump_enabled_p ())
12022 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12023 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12027 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12028 gsi_next (&si))
12030 gphi *phi = si.phi ();
12031 stmt_info = loop_vinfo->lookup_stmt (phi);
12032 if (!stmt_info)
12033 continue;
12035 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12036 && !STMT_VINFO_LIVE_P (stmt_info))
12037 continue;
12039 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12040 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12041 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12042 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12043 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12044 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12045 && ! PURE_SLP_STMT (stmt_info))
12046 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12049 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12050 !gsi_end_p (si);)
12052 stmt = gsi_stmt (si);
12053 /* During vectorization remove existing clobber stmts. */
12054 if (gimple_clobber_p (stmt))
12056 unlink_stmt_vdef (stmt);
12057 gsi_remove (&si, true);
12058 release_defs (stmt);
12060 else
12062 /* Ignore vector stmts created in the outer loop. */
12063 stmt_info = loop_vinfo->lookup_stmt (stmt);
12065 /* vector stmts created in the outer-loop during vectorization of
12066 stmts in an inner-loop may not have a stmt_info, and do not
12067 need to be vectorized. */
12068 stmt_vec_info seen_store = NULL;
12069 if (stmt_info)
12071 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12073 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12074 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12075 !gsi_end_p (subsi); gsi_next (&subsi))
12077 stmt_vec_info pat_stmt_info
12078 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12079 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12080 &si, &seen_store);
12082 stmt_vec_info pat_stmt_info
12083 = STMT_VINFO_RELATED_STMT (stmt_info);
12084 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12085 &si, &seen_store))
12086 maybe_set_vectorized_backedge_value (loop_vinfo,
12087 pat_stmt_info);
12089 else
12091 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12092 &seen_store))
12093 maybe_set_vectorized_backedge_value (loop_vinfo,
12094 stmt_info);
12097 gsi_next (&si);
12098 if (seen_store)
12100 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12101 /* Interleaving. The vectorization of the
12102 interleaving chain was completed - free
12103 all the stores in the chain. */
12104 vect_remove_stores (loop_vinfo,
12105 DR_GROUP_FIRST_ELEMENT (seen_store));
12106 else
12107 /* Free the attached stmt_vec_info and remove the stmt. */
12108 loop_vinfo->remove_stmt (stmt_info);
12113 /* Stub out scalar statements that must not survive vectorization.
12114 Doing this here helps with grouped statements, or statements that
12115 are involved in patterns. */
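/* For instance, a scalar IFN_MASK_LOAD whose (non-vector) result must not
   survive is replaced by "lhs = {zero}", and a scalar conditional
   internal-function call is replaced by a copy of its "else" argument,
   as done just below.  */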
12116 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12117 !gsi_end_p (gsi); gsi_next (&gsi))
12119 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12120 if (!call || !gimple_call_internal_p (call))
12121 continue;
12122 internal_fn ifn = gimple_call_internal_fn (call);
12123 if (ifn == IFN_MASK_LOAD)
12125 tree lhs = gimple_get_lhs (call);
12126 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12128 tree zero = build_zero_cst (TREE_TYPE (lhs));
12129 gimple *new_stmt = gimple_build_assign (lhs, zero);
12130 gsi_replace (&gsi, new_stmt, true);
12133 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12135 tree lhs = gimple_get_lhs (call);
12136 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12138 tree else_arg
12139 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12140 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12141 gsi_replace (&gsi, new_stmt, true);
12145 } /* BBs in loop */
12147 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
12148 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12149 if (integer_onep (step_vector))
12150 niters_no_overflow = true;
12151 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12152 niters_vector, step_vector, niters_vector_mult_vf,
12153 !niters_no_overflow);
12155 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12157 /* True if the final iteration might not handle a full vector's
12158 worth of scalar iterations. */
12159 bool final_iter_may_be_partial
12160 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12161 /* The minimum number of iterations performed by the epilogue. This
12162 is 1 when peeling for gaps because we always need a final scalar
12163 iteration. */
12164 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12165 /* +1 to convert latch counts to loop iteration counts,
12166 -min_epilogue_iters to remove iterations that cannot be performed
12167 by the vector code. */
12168 int bias_for_lowest = 1 - min_epilogue_iters;
12169 int bias_for_assumed = bias_for_lowest;
12170 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12171 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12173 /* When the amount of peeling is known at compile time, the first
12174 iteration will have exactly alignment_npeels active elements.
12175 In the worst case it will have at least one. */
12176 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12177 bias_for_lowest += lowest_vf - min_first_active;
12178 bias_for_assumed += assumed_vf - min_first_active;
12180 /* In these calculations the "- 1" converts loop iteration counts
12181 back to latch counts. */
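/* Worked example (illustrative numbers): with a scalar latch bound of 99
   (i.e. at most 100 iterations), lowest_vf == 4, no peeling for gaps and
   no partial vectors, bias_for_lowest is 1 and the new latch bound is
   (99 + 1) / 4 - 1 = 24, i.e. at most 25 vector iterations.  */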
12182 if (loop->any_upper_bound)
12184 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12185 loop->nb_iterations_upper_bound
12186 = (final_iter_may_be_partial
12187 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12188 lowest_vf) - 1
12189 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12190 lowest_vf) - 1);
12191 if (main_vinfo
12192 /* Both peeling for alignment and peeling for gaps can end up
12193 with the scalar epilogue running for more than VF-1 iterations. */
12194 && !main_vinfo->peeling_for_alignment
12195 && !main_vinfo->peeling_for_gaps)
12197 unsigned int bound;
12198 poly_uint64 main_iters
12199 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12200 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12201 main_iters
12202 = upper_bound (main_iters,
12203 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12204 if (can_div_away_from_zero_p (main_iters,
12205 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12206 &bound))
12207 loop->nb_iterations_upper_bound
12208 = wi::umin ((bound_wide_int) (bound - 1),
12209 loop->nb_iterations_upper_bound);
12212 if (loop->any_likely_upper_bound)
12213 loop->nb_iterations_likely_upper_bound
12214 = (final_iter_may_be_partial
12215 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12216 + bias_for_lowest, lowest_vf) - 1
12217 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12218 + bias_for_lowest, lowest_vf) - 1);
12219 if (loop->any_estimate)
12220 loop->nb_iterations_estimate
12221 = (final_iter_may_be_partial
12222 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12223 assumed_vf) - 1
12224 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12225 assumed_vf) - 1);
12226 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12227 assumed_vf, flat);
12229 if (dump_enabled_p ())
12231 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12233 dump_printf_loc (MSG_NOTE, vect_location,
12234 "LOOP VECTORIZED\n");
12235 if (loop->inner)
12236 dump_printf_loc (MSG_NOTE, vect_location,
12237 "OUTER LOOP VECTORIZED\n");
12238 dump_printf (MSG_NOTE, "\n");
12240 else
12241 dump_printf_loc (MSG_NOTE, vect_location,
12242 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12243 GET_MODE_NAME (loop_vinfo->vector_mode));
12246 /* Loops vectorized with a variable factor won't benefit from
12247 unrolling/peeling. */
12248 if (!vf.is_constant ())
12250 loop->unroll = 1;
12251 if (dump_enabled_p ())
12252 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12253 " variable-length vectorization factor\n");
12255 /* Free SLP instances here because otherwise stmt reference counting
12256 won't work. */
12257 slp_instance instance;
12258 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12259 vect_free_slp_instance (instance);
12260 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12261 /* Clear the safelen field since its value is invalid after vectorization,
12262 as the vectorized loop can have loop-carried dependencies. */
12263 loop->safelen = 0;
12265 if (epilogue)
12267 update_epilogue_loop_vinfo (epilogue, advance);
12269 epilogue->simduid = loop->simduid;
12270 epilogue->force_vectorize = loop->force_vectorize;
12271 epilogue->dont_vectorize = false;
12274 return epilogue;
12277 /* The code below tries to perform a simple optimization - revert
12278 if-conversion for masked stores: if the mask of a store is zero,
12279 do not perform it and, if possible, also skip the stored value producers.
12280 For example,
12281 for (i=0; i<n; i++)
12282 if (c[i])
12284 p1[i] += 1;
12285 p2[i] = p3[i] +2;
12287 this transformation will produce the following semi-hammock:
12289 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
12291 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12292 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12293 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12294 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12295 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12296 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12300 void
12301 optimize_mask_stores (class loop *loop)
12303 basic_block *bbs = get_loop_body (loop);
12304 unsigned nbbs = loop->num_nodes;
12305 unsigned i;
12306 basic_block bb;
12307 class loop *bb_loop;
12308 gimple_stmt_iterator gsi;
12309 gimple *stmt;
12310 auto_vec<gimple *> worklist;
12311 auto_purge_vect_location sentinel;
12313 vect_location = find_loop_location (loop);
12314 /* Pick up all masked stores in loop if any. */
12315 for (i = 0; i < nbbs; i++)
12317 bb = bbs[i];
12318 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12319 gsi_next (&gsi))
12321 stmt = gsi_stmt (gsi);
12322 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12323 worklist.safe_push (stmt);
12327 free (bbs);
12328 if (worklist.is_empty ())
12329 return;
12331 /* Loop has masked stores. */
12332 while (!worklist.is_empty ())
12334 gimple *last, *last_store;
12335 edge e, efalse;
12336 tree mask;
12337 basic_block store_bb, join_bb;
12338 gimple_stmt_iterator gsi_to;
12339 tree vdef, new_vdef;
12340 gphi *phi;
12341 tree vectype;
12342 tree zero;
12344 last = worklist.pop ();
12345 mask = gimple_call_arg (last, 2);
12346 bb = gimple_bb (last);
12347 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
12348 the same loop as if_bb. It can be different from LOOP when a two-
12349 level loop nest is vectorized and the mask_store belongs to the inner
12350 one. */
12351 e = split_block (bb, last);
12352 bb_loop = bb->loop_father;
12353 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12354 join_bb = e->dest;
12355 store_bb = create_empty_bb (bb);
12356 add_bb_to_loop (store_bb, bb_loop);
12357 e->flags = EDGE_TRUE_VALUE;
12358 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12359 /* Put STORE_BB on the likely path. */
12360 efalse->probability = profile_probability::likely ();
12361 e->probability = efalse->probability.invert ();
12362 store_bb->count = efalse->count ();
12363 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12364 if (dom_info_available_p (CDI_DOMINATORS))
12365 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12366 if (dump_enabled_p ())
12367 dump_printf_loc (MSG_NOTE, vect_location,
12368 "Create new block %d to sink mask stores.",
12369 store_bb->index);
12370 /* Create vector comparison with boolean result. */
12371 vectype = TREE_TYPE (mask);
12372 zero = build_zero_cst (vectype);
12373 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12374 gsi = gsi_last_bb (bb);
12375 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12376 /* Create new PHI node for vdef of the last masked store:
12377 .MEM_2 = VDEF <.MEM_1>
12378 will be converted to
12379 .MEM_3 = VDEF <.MEM_1>
12380 and new PHI node will be created in join bb
12381 .MEM_2 = PHI <.MEM_1, .MEM_3>
12383 vdef = gimple_vdef (last);
12384 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12385 gimple_set_vdef (last, new_vdef);
12386 phi = create_phi_node (vdef, join_bb);
12387 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12389 /* Put all masked stores with the same mask to STORE_BB if possible. */
12390 while (true)
12392 gimple_stmt_iterator gsi_from;
12393 gimple *stmt1 = NULL;
12395 /* Move masked store to STORE_BB. */
12396 last_store = last;
12397 gsi = gsi_for_stmt (last);
12398 gsi_from = gsi;
12399 /* Shift GSI to the previous stmt for further traversal. */
12400 gsi_prev (&gsi);
12401 gsi_to = gsi_start_bb (store_bb);
12402 gsi_move_before (&gsi_from, &gsi_to);
12403 /* Set GSI_TO to the start of the now non-empty block. */
12404 gsi_to = gsi_start_bb (store_bb);
12405 if (dump_enabled_p ())
12406 dump_printf_loc (MSG_NOTE, vect_location,
12407 "Move stmt to created bb\n%G", last);
12408 /* Move all stored value producers if possible. */
12409 while (!gsi_end_p (gsi))
12411 tree lhs;
12412 imm_use_iterator imm_iter;
12413 use_operand_p use_p;
12414 bool res;
12416 /* Skip debug statements. */
12417 if (is_gimple_debug (gsi_stmt (gsi)))
12419 gsi_prev (&gsi);
12420 continue;
12422 stmt1 = gsi_stmt (gsi);
12423 /* Do not consider statements writing to memory or having a
12424 volatile operand. */
12425 if (gimple_vdef (stmt1)
12426 || gimple_has_volatile_ops (stmt1))
12427 break;
12428 gsi_from = gsi;
12429 gsi_prev (&gsi);
12430 lhs = gimple_get_lhs (stmt1);
12431 if (!lhs)
12432 break;
12434 /* LHS of vectorized stmt must be SSA_NAME. */
12435 if (TREE_CODE (lhs) != SSA_NAME)
12436 break;
12438 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12440 /* Remove dead scalar statement. */
12441 if (has_zero_uses (lhs))
12443 gsi_remove (&gsi_from, true);
12444 continue;
12448 /* Check that LHS does not have uses outside of STORE_BB. */
12449 res = true;
12450 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12452 gimple *use_stmt;
12453 use_stmt = USE_STMT (use_p);
12454 if (is_gimple_debug (use_stmt))
12455 continue;
12456 if (gimple_bb (use_stmt) != store_bb)
12458 res = false;
12459 break;
12462 if (!res)
12463 break;
12465 if (gimple_vuse (stmt1)
12466 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12467 break;
12469 /* Can move STMT1 to STORE_BB. */
12470 if (dump_enabled_p ())
12471 dump_printf_loc (MSG_NOTE, vect_location,
12472 "Move stmt to created bb\n%G", stmt1);
12473 gsi_move_before (&gsi_from, &gsi_to);
12474 /* Shift GSI_TO for further insertion. */
12475 gsi_prev (&gsi_to);
12477 /* Put other masked stores with the same mask to STORE_BB. */
12478 if (worklist.is_empty ()
12479 || gimple_call_arg (worklist.last (), 2) != mask
12480 || worklist.last () != stmt1)
12481 break;
12482 last = worklist.pop ();
12484 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12488 /* Decide whether it is possible to use a zero-based induction variable
12489 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12490 the value that the induction variable must be able to hold in order
12491 to ensure that the rgroups eventually have no active vector elements.
12492 Return -1 otherwise. */
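/* Illustrative example (assumed numbers): with a maximum latch count of 6,
   VF == 4 and no skipped or peeled iterations, the limit is rounded down
   to the previous vector boundary (6 & -4 == 4) and one full vector
   iteration is added, so the IV must be able to hold 4 + 4 == 8.  */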
12494 widest_int
12495 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12497 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12498 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12499 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12501 /* Calculate the value that the induction variable must be able
12502 to hit in order to ensure that we end the loop with an all-false mask.
12503 This involves adding the maximum number of inactive trailing scalar
12504 iterations. */
12505 widest_int iv_limit = -1;
12506 if (max_loop_iterations (loop, &iv_limit))
12508 if (niters_skip)
12510 /* Add the maximum number of skipped iterations to the
12511 maximum iteration count. */
12512 if (TREE_CODE (niters_skip) == INTEGER_CST)
12513 iv_limit += wi::to_widest (niters_skip);
12514 else
12515 iv_limit += max_vf - 1;
12517 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12518 /* Make a conservatively-correct assumption. */
12519 iv_limit += max_vf - 1;
12521 /* IV_LIMIT is the maximum number of latch iterations, which is also
12522 the maximum in-range IV value. Round this value down to the previous
12523 vector alignment boundary and then add an extra full iteration. */
12524 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12525 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12527 return iv_limit;
12530 /* For the given rgroup_controls RGC, check whether an induction variable
12531 would ever hit a value that produces a set of all-false masks or zero
12532 lengths before wrapping around. Return true if it's possible to wrap
12533 around before hitting the desirable value, otherwise return false. */
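/* Illustrative example (assumed numbers): if IV_LIMIT is 0xffffffff,
   NITEMS is 2 and the compare type has 32-bit precision, then
   IV_LIMIT * NITEMS needs 33 bits, which exceeds that precision, so the
   function returns true (the IV might wrap).  */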
12535 bool
12536 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12538 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12540 if (iv_limit == -1)
12541 return true;
12543 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12544 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12545 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12547 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12548 return true;
12550 return false;