gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "memmodel.h"
  36 #include "optabs.h"
  37 #include "diagnostic-core.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "cfganal.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-ssa-loop.h"
  48 #include "cfgloop.h"
  49 #include "tree-scalar-evolution.h"
  50 #include "tree-vectorizer.h"
  51 #include "gimple-fold.h"
  52 #include "cgraph.h"
  53 #include "tree-cfg.h"
  54 #include "tree-if-conv.h"
  55 #include "internal-fn.h"
  56 #include "tree-vector-builder.h"
  57 #include "vec-perm-indices.h"
  58 #include "tree-eh.h"
  59 #include "case-cfn-macros.h"
  60 #include "langhooks.h"
  61
  62 /* Loop Vectorization Pass.
  63
  64    This pass tries to vectorize loops.
  65
  66    For example, the vectorizer transforms the following simple loop:
  67
  68         short a[N]; short b[N]; short c[N]; int i;
  69
  70         for (i=0; i<N; i++){
  71           a[i] = b[i] + c[i];
  72         }
  73
  74    as if it was manually vectorized by rewriting the source code into:
  75
  76         typedef int __attribute__((mode(V8HI))) v8hi;
  77         short a[N];  short b[N]; short c[N];   int i;
  78         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  79         v8hi va, vb, vc;
  80
  81         for (i=0; i<N/8; i++){
  82           vb = pb[i];
  83           vc = pc[i];
  84           va = vb + vc;
  85           pa[i] = va;
  86         }
  87
  88         The main entry to this pass is vectorize_loops(), in which
  89    the vectorizer applies a set of analyses on a given set of loops,
  90    followed by the actual vectorization transformation for the loops that
  91    had successfully passed the analysis phase.
  92         Throughout this pass we make a distinction between two types of
  93    data: scalars (which are represented by SSA_NAMES), and memory references
  94    ("data-refs").  These two types of data require different handling both
  95    during analysis and transformation. The types of data-refs that the
  96    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  97    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  98    accesses are required to have a simple (consecutive) access pattern.
  99
 100    Analysis phase:
 101    ===============
 102         The driver for the analysis phase is vect_analyze_loop().
 103    It applies a set of analyses, some of which rely on the scalar evolution
 104    analyzer (scev) developed by Sebastian Pop.
 105
 106         During the analysis phase the vectorizer records some information
 107    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 108    loop, as well as general information about the loop as a whole, which is
 109    recorded in a "loop_vec_info" struct attached to each loop.
 110
 111    Transformation phase:
 112    =====================
 113         The loop transformation phase scans all the stmts in the loop, and
 114    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 115    the loop that needs to be vectorized.  It inserts the vector code sequence
 116    just before the scalar stmt S, and records a pointer to the vector code
 117    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 118    attached to S).  This pointer will be used for the vectorization of following
 119    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 120    otherwise, we rely on dead code elimination for removing it.
 121
 122         For example, say stmt S1 was vectorized into stmt VS1:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    S2:  a = b;
 127
 128    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 129    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 130    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 131    resulting sequence would be:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    VS2: va = vb;
 136    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 137
 138         Operands that are not SSA_NAMEs, are data-refs that appear in
 139    load/store operations (like 'x[i]' in S1), and are handled differently.
 140
 141    Target modeling:
 142    =================
 143         Currently the only target specific information that is used is the
 144    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 145    Targets that can support different sizes of vectors, for now will need
 146    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 147    flexibility will be added in the future.
 148
 149         Since we only vectorize operations which vector form can be
 150    expressed using existing tree codes, to verify that an operation is
 151    supported, the vectorizer checks the relevant optab at the relevant
 152    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 153    the value found is CODE_FOR_nothing, then there's no target support, and
 154    we can't vectorize the stmt.
 155
 156    For additional information on this project see:
 157    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 158 */
 159
 160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 161                                                 unsigned *);
 162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 163                                                bool *, bool *, bool);
 164
 165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 166    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 167    may already be set for general statements (not just data refs).  */
 168
 169 static opt_result
 170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 171                               bool vectype_maybe_set_p,
 172                               poly_uint64 *vf)
 173 {
 174   gimple *stmt = stmt_info->stmt;
 175
 176   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 177        && !STMT_VINFO_LIVE_P (stmt_info))
 178       || gimple_clobber_p (stmt))
 179     {
 180       if (dump_enabled_p ())
 181         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 182       return opt_result::success ();
 183     }
 184
 185   tree stmt_vectype, nunits_vectype;
 186   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 187                                                    &stmt_vectype,
 188                                                    &nunits_vectype);
 189   if (!res)
 190     return res;
 191
 192   if (stmt_vectype)
 193     {
 194       if (STMT_VINFO_VECTYPE (stmt_info))
 195         /* The only case when a vectype had been already set is for stmts
 196            that contain a data ref, or for "pattern-stmts" (stmts generated
 197            by the vectorizer to represent/replace a certain idiom).  */
 198         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 199                      || vectype_maybe_set_p)
 200                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 201       else
 202         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 203     }
 204
 205   if (nunits_vectype)
 206     vect_update_max_nunits (vf, nunits_vectype);
 207
 208   return opt_result::success ();
 209 }
 210
 211 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 212    types of STMT_INFO and all attached pattern statements and update
 213    the vectorization factor VF accordingly.  Return true on success
 214    or false if something prevented vectorization.  */
 215
 216 static opt_result
 217 vect_determine_vf_for_stmt (vec_info *vinfo,
 218                             stmt_vec_info stmt_info, poly_uint64 *vf)
 219 {
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 222                      stmt_info->stmt);
 223   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 224   if (!res)
 225     return res;
 226
 227   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 228       && STMT_VINFO_RELATED_STMT (stmt_info))
 229     {
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 232
 233       /* If a pattern statement has def stmts, analyze them too.  */
 234       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 235            !gsi_end_p (si); gsi_next (&si))
 236         {
 237           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 238           if (dump_enabled_p ())
 239             dump_printf_loc (MSG_NOTE, vect_location,
 240                              "==> examining pattern def stmt: %G",
 241                              def_stmt_info->stmt);
 242           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 243           if (!res)
 244             return res;
 245         }
 246
 247       if (dump_enabled_p ())
 248         dump_printf_loc (MSG_NOTE, vect_location,
 249                          "==> examining pattern statement: %G",
 250                          stmt_info->stmt);
 251       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 252       if (!res)
 253         return res;
 254     }
 255
 256   return opt_result::success ();
 257 }
 258
 259 /* Function vect_determine_vectorization_factor
 260
 261    Determine the vectorization factor (VF).  VF is the number of data elements
 262    that are operated upon in parallel in a single iteration of the vectorized
 263    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 264    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 265    elements can fit in a single vector register.
 266
 267    We currently support vectorization of loops in which all types operated upon
 268    are of the same size.  Therefore this function currently sets VF according to
 269    the size of the types operated upon, and fails if there are multiple sizes
 270    in the loop.
 271
 272    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 273    original loop:
 274         for (i=0; i<N; i++){
 275           a[i] = b[i] + c[i];
 276         }
 277
 278    vectorized loop:
 279         for (i=0; i<N; i+=VF){
 280           a[i:VF] = b[i:VF] + c[i:VF];
 281         }
 282 */
 283
 284 static opt_result
 285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 286 {
 287   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 288   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 289   unsigned nbbs = loop->num_nodes;
 290   poly_uint64 vectorization_factor = 1;
 291   tree scalar_type = NULL_TREE;
 292   gphi *phi;
 293   tree vectype;
 294   stmt_vec_info stmt_info;
 295   unsigned i;
 296
 297   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 298
 299   for (i = 0; i < nbbs; i++)
 300     {
 301       basic_block bb = bbs[i];
 302
 303       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 304            gsi_next (&si))
 305         {
 306           phi = si.phi ();
 307           stmt_info = loop_vinfo->lookup_stmt (phi);
 308           if (dump_enabled_p ())
 309             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 310                              (gimple *) phi);
 311
 312           gcc_assert (stmt_info);
 313
 314           if (STMT_VINFO_RELEVANT_P (stmt_info)
 315               || STMT_VINFO_LIVE_P (stmt_info))
 316             {
 317               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 318               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 319
 320               if (dump_enabled_p ())
 321                 dump_printf_loc (MSG_NOTE, vect_location,
 322                                  "get vectype for scalar type:  %T\n",
 323                                  scalar_type);
 324
 325               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 326               if (!vectype)
 327                 return opt_result::failure_at (phi,
 328                                                "not vectorized: unsupported "
 329                                                "data-type %T\n",
 330                                                scalar_type);
 331               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 332
 333               if (dump_enabled_p ())
 334                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 335                                  vectype);
 336
 337               if (dump_enabled_p ())
 338                 {
 339                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 340                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 341                   dump_printf (MSG_NOTE, "\n");
 342                 }
 343
 344               vect_update_max_nunits (&vectorization_factor, vectype);
 345             }
 346         }
 347
 348       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 349            gsi_next (&si))
 350         {
 351           if (is_gimple_debug (gsi_stmt (si)))
 352             continue;
 353           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 354           opt_result res
 355             = vect_determine_vf_for_stmt (loop_vinfo,
 356                                           stmt_info, &vectorization_factor);
 357           if (!res)
 358             return res;
 359         }
 360     }
 361
 362   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 363   if (dump_enabled_p ())
 364     {
 365       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 366       dump_dec (MSG_NOTE, vectorization_factor);
 367       dump_printf (MSG_NOTE, "\n");
 368     }
 369
 370   if (known_le (vectorization_factor, 1U))
 371     return opt_result::failure_at (vect_location,
 372                                    "not vectorized: unsupported data-type\n");
 373   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 374   return opt_result::success ();
 375 }
 376
 377
 378 /* Function vect_is_simple_iv_evolution.
 379
 380    FORNOW: A simple evolution of an induction variables in the loop is
 381    considered a polynomial evolution.  */
 382
 383 static bool
 384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 385                              tree * step)
 386 {
 387   tree init_expr;
 388   tree step_expr;
 389   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 390   basic_block bb;
 391
 392   /* When there is no evolution in this loop, the evolution function
 393      is not "simple".  */
 394   if (evolution_part == NULL_TREE)
 395     return false;
 396
 397   /* When the evolution is a polynomial of degree >= 2
 398      the evolution function is not "simple".  */
 399   if (tree_is_chrec (evolution_part))
 400     return false;
 401
 402   step_expr = evolution_part;
 403   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 404
 405   if (dump_enabled_p ())
 406     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 407                      step_expr, init_expr);
 408
 409   *init = init_expr;
 410   *step = step_expr;
 411
 412   if (TREE_CODE (step_expr) != INTEGER_CST
 413       && (TREE_CODE (step_expr) != SSA_NAME
 414           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 415               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 416           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 417               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 418                   || !flag_associative_math)))
 419       && (TREE_CODE (step_expr) != REAL_CST
 420           || !flag_associative_math))
 421     {
 422       if (dump_enabled_p ())
 423         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 424                          "step unknown.\n");
 425       return false;
 426     }
 427
 428   return true;
 429 }
 430
 431 /* Function vect_is_nonlinear_iv_evolution
 432
 433    Only support nonlinear induction for integer type
 434    1. neg
 435    2. mul by constant
 436    3. lshift/rshift by constant.
 437
 438    For neg induction, return a fake step as integer -1.  */
 439 static bool
 440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 441                                 gphi* loop_phi_node, tree *init, tree *step)
 442 {
 443   tree init_expr, ev_expr, result, op1, op2;
 444   gimple* def;
 445
 446   if (gimple_phi_num_args (loop_phi_node) != 2)
 447     return false;
 448
 449   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 450   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 451
 452   /* Support nonlinear induction only for integer type.  */
 453   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 454     return false;
 455
 456   *init = init_expr;
 457   result = PHI_RESULT (loop_phi_node);
 458
 459   if (TREE_CODE (ev_expr) != SSA_NAME
 460       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 461       || !is_gimple_assign (def))
 462     return false;
 463
 464   enum tree_code t_code = gimple_assign_rhs_code (def);
 465   switch (t_code)
 466     {
 467     case NEGATE_EXPR:
 468       if (gimple_assign_rhs1 (def) != result)
 469         return false;
 470       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 471       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 472       break;
 473
 474     case RSHIFT_EXPR:
 475     case LSHIFT_EXPR:
 476     case MULT_EXPR:
 477       op1 = gimple_assign_rhs1 (def);
 478       op2 = gimple_assign_rhs2 (def);
 479       if (TREE_CODE (op2) != INTEGER_CST
 480           || op1 != result)
 481         return false;
 482       *step = op2;
 483       if (t_code == LSHIFT_EXPR)
 484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 485       else if (t_code == RSHIFT_EXPR)
 486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 487       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 488       else
 489         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 490       break;
 491
 492     default:
 493       return false;
 494     }
 495
 496   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 497   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 498
 499   return true;
 500 }
 501
 502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 503    what we are assuming is a double reduction.  For example, given
 504    a structure like this:
 505
 506       outer1:
 507         x_1 = PHI <x_4(outer2), ...>;
 508         ...
 509
 510       inner:
 511         x_2 = PHI <x_1(outer1), ...>;
 512         ...
 513         x_3 = ...;
 514         ...
 515
 516       outer2:
 517         x_4 = PHI <x_3(inner)>;
 518         ...
 519
 520    outer loop analysis would treat x_1 as a double reduction phi and
 521    this function would then return true for x_2.  */
 522
 523 static bool
 524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 525 {
 526   use_operand_p use_p;
 527   ssa_op_iter op_iter;
 528   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 529     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 530       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 531         return true;
 532   return false;
 533 }
 534
 535 /* Returns true if Phi is a first-order recurrence. A first-order
 536    recurrence is a non-reduction recurrence relation in which the value of
 537    the recurrence in the current loop iteration equals a value defined in
 538    the previous iteration.  */
 539
 540 static bool
 541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 542                                    gphi *phi)
 543 {
 544   /* A nested cycle isn't vectorizable as first order recurrence.  */
 545   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 546     return false;
 547
 548   /* Ensure the loop latch definition is from within the loop.  */
 549   edge latch = loop_latch_edge (loop);
 550   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 551   if (TREE_CODE (ldef) != SSA_NAME
 552       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 553       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 554       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 555     return false;
 556
 557   tree def = gimple_phi_result (phi);
 558
 559   /* Ensure every use_stmt of the phi node is dominated by the latch
 560      definition.  */
 561   imm_use_iterator imm_iter;
 562   use_operand_p use_p;
 563   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 564     if (!is_gimple_debug (USE_STMT (use_p))
 565         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 566             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 567                                             USE_STMT (use_p))))
 568       return false;
 569
 570   /* First-order recurrence autovectorization needs shuffle vector.  */
 571   tree scalar_type = TREE_TYPE (def);
 572   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 573   if (!vectype)
 574     return false;
 575
 576   return true;
 577 }
 578
 579 /* Function vect_analyze_scalar_cycles_1.
 580
 581    Examine the cross iteration def-use cycles of scalar variables
 582    in LOOP.  LOOP_VINFO represents the loop that is now being
 583    considered for vectorization (can be LOOP, or an outer-loop
 584    enclosing LOOP).  SLP indicates there will be some subsequent
 585    slp analyses or not.  */
 586
 587 static void
 588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 589                               bool slp)
 590 {
 591   basic_block bb = loop->header;
 592   tree init, step;
 593   auto_vec<stmt_vec_info, 64> worklist;
 594   gphi_iterator gsi;
 595   bool double_reduc, reduc_chain;
 596
 597   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 598
 599   /* First - identify all inductions.  Reduction detection assumes that all the
 600      inductions have been identified, therefore, this order must not be
 601      changed.  */
 602   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 603     {
 604       gphi *phi = gsi.phi ();
 605       tree access_fn = NULL;
 606       tree def = PHI_RESULT (phi);
 607       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 611                          (gimple *) phi);
 612
 613       /* Skip virtual phi's.  The data dependences that are associated with
 614          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 615       if (virtual_operand_p (def))
 616         continue;
 617
 618       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 619
 620       /* Analyze the evolution function.  */
 621       access_fn = analyze_scalar_evolution (loop, def);
 622       if (access_fn)
 623         {
 624           STRIP_NOPS (access_fn);
 625           if (dump_enabled_p ())
 626             dump_printf_loc (MSG_NOTE, vect_location,
 627                              "Access function of PHI: %T\n", access_fn);
 628           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 629             = initial_condition_in_loop_num (access_fn, loop->num);
 630           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 631             = evolution_part_in_loop_num (access_fn, loop->num);
 632         }
 633
 634       if ((!access_fn
 635            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 636            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 637                                             &init, &step)
 638            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 639                && TREE_CODE (step) != INTEGER_CST))
 640           /* Only handle nonlinear iv for same loop.  */
 641           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 642               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 643                                                   phi, &init, &step)))
 644         {
 645           worklist.safe_push (stmt_vinfo);
 646           continue;
 647         }
 648
 649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 650                   != NULL_TREE);
 651       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 652
 653       if (dump_enabled_p ())
 654         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 655       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 656     }
 657
 658
 659   /* Second - identify all reductions and nested cycles.  */
 660   while (worklist.length () > 0)
 661     {
 662       stmt_vec_info stmt_vinfo = worklist.pop ();
 663       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 664       tree def = PHI_RESULT (phi);
 665
 666       if (dump_enabled_p ())
 667         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 668                          (gimple *) phi);
 669
 670       gcc_assert (!virtual_operand_p (def)
 671                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 672
 673       stmt_vec_info reduc_stmt_info
 674         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 675                                     &reduc_chain, slp);
 676       if (reduc_stmt_info)
 677         {
 678           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 679           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 680           if (double_reduc)
 681             {
 682               if (dump_enabled_p ())
 683                 dump_printf_loc (MSG_NOTE, vect_location,
 684                                  "Detected double reduction.\n");
 685
 686               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 687               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 688             }
 689           else
 690             {
 691               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 692                 {
 693                   if (dump_enabled_p ())
 694                     dump_printf_loc (MSG_NOTE, vect_location,
 695                                      "Detected vectorizable nested cycle.\n");
 696
 697                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 698                 }
 699               else
 700                 {
 701                   if (dump_enabled_p ())
 702                     dump_printf_loc (MSG_NOTE, vect_location,
 703                                      "Detected reduction.\n");
 704
 705                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 706                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 707                   /* Store the reduction cycles for possible vectorization in
 708                      loop-aware SLP if it was not detected as reduction
 709                      chain.  */
 710                   if (! reduc_chain)
 711                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 712                       (reduc_stmt_info);
 713                 }
 714             }
 715         }
 716       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 717         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 718       else
 719         if (dump_enabled_p ())
 720           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                            "Unknown def-use cycle pattern.\n");
 722     }
 723 }
 724
 725
 726 /* Function vect_analyze_scalar_cycles.
 727
 728    Examine the cross iteration def-use cycles of scalar variables, by
 729    analyzing the loop-header PHIs of scalar variables.  Classify each
 730    cycle as one of the following: invariant, induction, reduction, unknown.
 731    We do that for the loop represented by LOOP_VINFO, and also to its
 732    inner-loop, if exists.
 733    Examples for scalar cycles:
 734
 735    Example1: reduction:
 736
 737               loop1:
 738               for (i=0; i<N; i++)
 739                  sum += a[i];
 740
 741    Example2: induction:
 742
 743               loop2:
 744               for (i=0; i<N; i++)
 745                  a[i] = i;  */
 746
 747 static void
 748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 749 {
 750   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 751
 752   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 753
 754   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 755      Reductions in such inner-loop therefore have different properties than
 756      the reductions in the nest that gets vectorized:
 757      1. When vectorized, they are executed in the same order as in the original
 758         scalar loop, so we can't change the order of computation when
 759         vectorizing them.
 760      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 761         current checks are too strict.  */
 762
 763   if (loop->inner)
 764     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 765 }
 766
 767 /* Transfer group and reduction information from STMT_INFO to its
 768    pattern stmt.  */
 769
 770 static void
 771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 772 {
 773   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 774   stmt_vec_info stmtp;
 775   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 776               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 777   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 778   do
 779     {
 780       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 781       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 782                            == STMT_VINFO_DEF_TYPE (stmt_info));
 783       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 784       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 785       if (stmt_info)
 786         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 787           = STMT_VINFO_RELATED_STMT (stmt_info);
 788     }
 789   while (stmt_info);
 790 }
 791
 792 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 793
 794 static void
 795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 796 {
 797   stmt_vec_info first;
 798   unsigned i;
 799
 800   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 801     {
 802       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 803       while (next)
 804         {
 805           if ((STMT_VINFO_IN_PATTERN_P (next)
 806                != STMT_VINFO_IN_PATTERN_P (first))
 807               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 808             break;
 809           next = REDUC_GROUP_NEXT_ELEMENT (next);
 810         }
 811       /* If all reduction chain members are well-formed patterns adjust
 812          the group to group the pattern stmts instead.  */
 813       if (! next
 814           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 815         {
 816           if (STMT_VINFO_IN_PATTERN_P (first))
 817             {
 818               vect_fixup_reduc_chain (first);
 819               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 820                 = STMT_VINFO_RELATED_STMT (first);
 821             }
 822         }
 823       /* If not all stmt in the chain are patterns or if we failed
 824          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 825          it as regular reduction instead.  */
 826       else
 827         {
 828           stmt_vec_info vinfo = first;
 829           stmt_vec_info last = NULL;
 830           while (vinfo)
 831             {
 832               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 833               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 834               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 835               last = vinfo;
 836               vinfo = next;
 837             }
 838           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 839             = vect_internal_def;
 840           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 841           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 842           --i;
 843         }
 844     }
 845 }
 846
 847 /* Function vect_get_loop_niters.
 848
 849    Determine how many iterations the loop is executed and place it
 850    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 851    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 852    niter information holds in ASSUMPTIONS.
 853
 854    Return the loop exit conditions.  */
 855
 856
 857 static vec<gcond *>
 858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
 859                       tree *number_of_iterations, tree *number_of_iterationsm1)
 860 {
 861   auto_vec<edge> exits = get_loop_exit_edges (loop);
 862   vec<gcond *> conds;
 863   conds.create (exits.length ());
 864   class tree_niter_desc niter_desc;
 865   tree niter_assumptions, niter, may_be_zero;
 866
 867   *assumptions = boolean_true_node;
 868   *number_of_iterationsm1 = chrec_dont_know;
 869   *number_of_iterations = chrec_dont_know;
 870
 871   DUMP_VECT_SCOPE ("get_loop_niters");
 872
 873   if (exits.is_empty ())
 874     return conds;
 875
 876   if (dump_enabled_p ())
 877     dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
 878                      exits.length ());
 879
 880   edge exit;
 881   unsigned int i;
 882   FOR_EACH_VEC_ELT (exits, i, exit)
 883     {
 884       gcond *cond = get_loop_exit_condition (exit);
 885       if (cond)
 886         conds.safe_push (cond);
 887
 888       if (dump_enabled_p ())
 889         dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
 890
 891       if (exit != main_exit)
 892         continue;
 893
 894       may_be_zero = NULL_TREE;
 895       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 896           || chrec_contains_undetermined (niter_desc.niter))
 897         continue;
 898
 899       niter_assumptions = niter_desc.assumptions;
 900       may_be_zero = niter_desc.may_be_zero;
 901       niter = niter_desc.niter;
 902
 903       if (may_be_zero && integer_zerop (may_be_zero))
 904         may_be_zero = NULL_TREE;
 905
 906       if (may_be_zero)
 907         {
 908           if (COMPARISON_CLASS_P (may_be_zero))
 909             {
 910               /* Try to combine may_be_zero with assumptions, this can simplify
 911                  computation of niter expression.  */
 912               if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 913                 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 914                                                  niter_assumptions,
 915                                                  fold_build1 (TRUTH_NOT_EXPR,
 916                                                               boolean_type_node,
 917                                                               may_be_zero));
 918               else
 919                 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 920                                      build_int_cst (TREE_TYPE (niter), 0),
 921                                      rewrite_to_non_trapping_overflow (niter));
 922
 923               may_be_zero = NULL_TREE;
 924             }
 925           else if (integer_nonzerop (may_be_zero))
 926             {
 927               *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 928               *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 929               continue;
 930             }
 931           else
 932             continue;
 933        }
 934
 935       /* Loop assumptions are based off the normal exit.  */
 936       *assumptions = niter_assumptions;
 937       *number_of_iterationsm1 = niter;
 938
 939       /* We want the number of loop header executions which is the number
 940          of latch executions plus one.
 941          ???  For UINT_MAX latch executions this number overflows to zero
 942          for loops like do { n++; } while (n != 0);  */
 943       if (niter && !chrec_contains_undetermined (niter))
 944           niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
 945                                unshare_expr (niter),
 946                                build_int_cst (TREE_TYPE (niter), 1));
 947       *number_of_iterations = niter;
 948     }
 949
 950   if (dump_enabled_p ())
 951     dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
 952
 953   return conds;
 954 }
 955
 956 /*  Determine the main loop exit for the vectorizer.  */
 957
 958 edge
 959 vec_init_loop_exit_info (class loop *loop)
 960 {
 961   /* Before we begin we must first determine which exit is the main one and
 962      which are auxilary exits.  */
 963   auto_vec<edge> exits = get_loop_exit_edges (loop);
 964   if (exits.length () == 1)
 965     return exits[0];
 966
 967   /* If we have multiple exits we only support counting IV at the moment.  Analyze
 968      all exits and return one */
 969   class tree_niter_desc niter_desc;
 970   edge candidate = NULL;
 971   for (edge exit : exits)
 972     {
 973       if (!get_loop_exit_condition (exit))
 974         continue;
 975
 976       if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 977           && !chrec_contains_undetermined (niter_desc.niter))
 978         {
 979           if (!niter_desc.may_be_zero || !candidate)
 980             candidate = exit;
 981         }
 982     }
 983
 984   return candidate;
 985 }
 986
 987 /* Function bb_in_loop_p
 988
 989    Used as predicate for dfs order traversal of the loop bbs.  */
 990
 991 static bool
 992 bb_in_loop_p (const_basic_block bb, const void *data)
 993 {
 994   const class loop *const loop = (const class loop *)data;
 995   if (flow_bb_inside_loop_p (loop, bb))
 996     return true;
 997   return false;
 998 }
 999
1000
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1003
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005   : vec_info (vec_info::loop, shared),
1006     loop (loop_in),
1007     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008     num_itersm1 (NULL_TREE),
1009     num_iters (NULL_TREE),
1010     num_iters_unchanged (NULL_TREE),
1011     num_iters_assumptions (NULL_TREE),
1012     vector_costs (nullptr),
1013     scalar_costs (nullptr),
1014     th (0),
1015     versioning_threshold (0),
1016     vectorization_factor (0),
1017     main_loop_edge (nullptr),
1018     skip_main_loop_edge (nullptr),
1019     skip_this_loop_edge (nullptr),
1020     reusable_accumulators (),
1021     suggested_unroll_factor (1),
1022     max_vectorization_factor (0),
1023     mask_skip_niters (NULL_TREE),
1024     rgroup_compare_type (NULL_TREE),
1025     simd_if_cond (NULL_TREE),
1026     partial_vector_style (vect_partial_vectors_none),
1027     unaligned_dr (NULL),
1028     peeling_for_alignment (0),
1029     ptr_mask (0),
1030     ivexpr_map (NULL),
1031     scan_map (NULL),
1032     slp_unrolling_factor (1),
1033     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034     vectorizable (false),
1035     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036     using_partial_vectors_p (false),
1037     using_decrementing_iv_p (false),
1038     using_select_vl_p (false),
1039     epil_using_partial_vectors_p (false),
1040     partial_load_store_bias (0),
1041     peeling_for_gaps (false),
1042     peeling_for_niter (false),
1043     no_data_dependencies (false),
1044     has_mask_store (false),
1045     scalar_loop_scaling (profile_probability::uninitialized ()),
1046     scalar_loop (NULL),
1047     orig_loop_info (NULL),
1048     vec_loop_iv_exit (NULL),
1049     vec_epilogue_loop_iv_exit (NULL),
1050     scalar_loop_iv_exit (NULL)
1051 {
1052   /* CHECKME: We want to visit all BBs before their successors (except for
1053      latch blocks, for which this assertion wouldn't hold).  In the simple
1054      case of the loop forms we allow, a dfs order of the BBs would the same
1055      as reversed postorder traversal, so we are safe.  */
1056
1057   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058                                           bbs, loop->num_nodes, loop);
1059   gcc_assert (nbbs == loop->num_nodes);
1060
1061   for (unsigned int i = 0; i < nbbs; i++)
1062     {
1063       basic_block bb = bbs[i];
1064       gimple_stmt_iterator si;
1065
1066       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1067         {
1068           gimple *phi = gsi_stmt (si);
1069           gimple_set_uid (phi, 0);
1070           add_stmt (phi);
1071         }
1072
1073       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1074         {
1075           gimple *stmt = gsi_stmt (si);
1076           gimple_set_uid (stmt, 0);
1077           if (is_gimple_debug (stmt))
1078             continue;
1079           add_stmt (stmt);
1080           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081              third argument is the #pragma omp simd if (x) condition, when 0,
1082              loop shouldn't be vectorized, when non-zero constant, it should
1083              be vectorized normally, otherwise versioned with vectorized loop
1084              done if the condition is non-zero at runtime.  */
1085           if (loop_in->simduid
1086               && is_gimple_call (stmt)
1087               && gimple_call_internal_p (stmt)
1088               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089               && gimple_call_num_args (stmt) >= 3
1090               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091               && (loop_in->simduid
1092                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1093             {
1094               tree arg = gimple_call_arg (stmt, 2);
1095               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096                 simd_if_cond = arg;
1097               else
1098                 gcc_assert (integer_nonzerop (arg));
1099             }
1100         }
1101     }
1102
1103   epilogue_vinfos.create (6);
1104 }
1105
1106 /* Free all levels of rgroup CONTROLS.  */
1107
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1110 {
1111   rgroup_controls *rgc;
1112   unsigned int i;
1113   FOR_EACH_VEC_ELT (*controls, i, rgc)
1114     rgc->controls.release ();
1115   controls->release ();
1116 }
1117
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119    stmt_vec_info structs of all the stmts in the loop.  */
1120
1121 _loop_vec_info::~_loop_vec_info ()
1122 {
1123   free (bbs);
1124
1125   release_vec_loop_controls (&masks.rgc_vec);
1126   release_vec_loop_controls (&lens);
1127   delete ivexpr_map;
1128   delete scan_map;
1129   epilogue_vinfos.release ();
1130   delete scalar_costs;
1131   delete vector_costs;
1132
1133   /* When we release an epiloge vinfo that we do not intend to use
1134      avoid clearing AUX of the main loop which should continue to
1135      point to the main loop vinfo since otherwise we'll leak that.  */
1136   if (loop->aux == this)
1137     loop->aux = NULL;
1138 }
1139
1140 /* Return an invariant or register for EXPR and emit necessary
1141    computations in the LOOP_VINFO loop preheader.  */
1142
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1145 {
1146   if (is_gimple_reg (expr)
1147       || is_gimple_min_invariant (expr))
1148     return expr;
1149
1150   if (! loop_vinfo->ivexpr_map)
1151     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153   if (! cached)
1154     {
1155       gimple_seq stmts = NULL;
1156       cached = force_gimple_operand (unshare_expr (expr),
1157                                      &stmts, true, NULL_TREE);
1158       if (stmts)
1159         {
1160           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161           gsi_insert_seq_on_edge_immediate (e, stmts);
1162         }
1163     }
1164   return cached;
1165 }
1166
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168    all masks required to mask LOOP_VINFO.  */
1169
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1172 {
1173   rgroup_controls *rgm;
1174   unsigned int i;
1175   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176     if (rgm->type != NULL_TREE
1177         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178                                             cmp_type, rgm->type,
1179                                             OPTIMIZE_FOR_SPEED))
1180       return false;
1181   return true;
1182 }
1183
1184 /* Calculate the maximum number of scalars per iteration for every
1185    rgroup in LOOP_VINFO.  */
1186
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1189 {
1190   unsigned int res = 1;
1191   unsigned int i;
1192   rgroup_controls *rgm;
1193   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194     res = MAX (res, rgm->max_nscalars_per_iter);
1195   return res;
1196 }
1197
1198 /* Calculate the minimum precision necessary to represent:
1199
1200       MAX_NITERS * FACTOR
1201
1202    as an unsigned integer, where MAX_NITERS is the maximum number of
1203    loop header iterations for the original scalar form of LOOP_VINFO.  */
1204
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1207 {
1208   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1209
1210   /* Get the maximum number of iterations that is representable
1211      in the counter type.  */
1212   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1214
1215   /* Get a more refined estimate for the number of iterations.  */
1216   widest_int max_back_edges;
1217   if (max_loop_iterations (loop, &max_back_edges))
1218     max_ni = wi::smin (max_ni, max_back_edges + 1);
1219
1220   /* Work out how many bits we need to represent the limit.  */
1221   return wi::min_precision (max_ni * factor, UNSIGNED);
1222 }
1223
1224 /* True if the loop needs peeling or partial vectors when vectorized.  */
1225
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1228 {
1229   unsigned HOST_WIDE_INT const_vf;
1230   HOST_WIDE_INT max_niter
1231     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1232
1233   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236                                           (loop_vinfo));
1237
1238   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1240     {
1241       /* Work out the (constant) number of iterations that need to be
1242          peeled for reasons other than niters.  */
1243       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245         peel_niter += 1;
1246       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248         return true;
1249     }
1250   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251       /* ??? When peeling for gaps but not alignment, we could
1252          try to check whether the (variable) niters is known to be
1253          VF * N + 1.  That's something of a niche case though.  */
1254       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257            < (unsigned) exact_log2 (const_vf))
1258           /* In case of versioning, check if the maximum number of
1259              iterations is greater than th.  If they are identical,
1260              the epilogue is unnecessary.  */
1261           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262               || ((unsigned HOST_WIDE_INT) max_niter
1263                   > (th / const_vf) * const_vf))))
1264     return true;
1265
1266   return false;
1267 }
1268
1269 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1270    whether we can actually generate the masks required.  Return true if so,
1271    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1272
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1275 {
1276   unsigned int min_ni_width;
1277
1278   /* Use a normal loop if there are no statements that need masking.
1279      This only happens in rare degenerate cases: it means that the loop
1280      has no loads, no stores, and no live-out values.  */
1281   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282     return false;
1283
1284   /* Produce the rgroup controls.  */
1285   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1286     {
1287       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288       tree vectype = mask.first;
1289       unsigned nvectors = mask.second;
1290
1291       if (masks->rgc_vec.length () < nvectors)
1292         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294       /* The number of scalars per iteration and the number of vectors are
1295          both compile-time constants.  */
1296       unsigned int nscalars_per_iter
1297           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1299
1300       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1301         {
1302           rgm->max_nscalars_per_iter = nscalars_per_iter;
1303           rgm->type = truth_type_for (vectype);
1304           rgm->factor = 1;
1305         }
1306     }
1307
1308   unsigned int max_nscalars_per_iter
1309     = vect_get_max_nscalars_per_iter (loop_vinfo);
1310
1311   /* Work out how many bits we need to represent the limit.  */
1312   min_ni_width
1313     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1314
1315   /* Find a scalar mode for which WHILE_ULT is supported.  */
1316   opt_scalar_int_mode cmp_mode_iter;
1317   tree cmp_type = NULL_TREE;
1318   tree iv_type = NULL_TREE;
1319   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320   unsigned int iv_precision = UINT_MAX;
1321
1322   if (iv_limit != -1)
1323     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324                                       UNSIGNED);
1325
1326   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1327     {
1328       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329       if (cmp_bits >= min_ni_width
1330           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1331         {
1332           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333           if (this_type
1334               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1335             {
1336               /* Although we could stop as soon as we find a valid mode,
1337                  there are at least two reasons why that's not always the
1338                  best choice:
1339
1340                  - An IV that's Pmode or wider is more likely to be reusable
1341                    in address calculations than an IV that's narrower than
1342                    Pmode.
1343
1344                  - Doing the comparison in IV_PRECISION or wider allows
1345                    a natural 0-based IV, whereas using a narrower comparison
1346                    type requires mitigations against wrap-around.
1347
1348                  Conversely, if the IV limit is variable, doing the comparison
1349                  in a wider type than the original type can introduce
1350                  unnecessary extensions, so picking the widest valid mode
1351                  is not always a good choice either.
1352
1353                  Here we prefer the first IV type that's Pmode or wider,
1354                  and the first comparison type that's IV_PRECISION or wider.
1355                  (The comparison type must be no wider than the IV type,
1356                  to avoid extensions in the vector loop.)
1357
1358                  ??? We might want to try continuing beyond Pmode for ILP32
1359                  targets if CMP_BITS < IV_PRECISION.  */
1360               iv_type = this_type;
1361               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362                 cmp_type = this_type;
1363               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364                 break;
1365             }
1366         }
1367     }
1368
1369   if (!cmp_type)
1370     {
1371       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372       return false;
1373     }
1374
1375   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378   return true;
1379 }
1380
1381 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1382    whether we can actually generate AVX512 style masks.  Return true if so,
1383    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1384
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1387 {
1388   /* Produce differently organized rgc_vec and differently check
1389      we can produce masks.  */
1390
1391   /* Use a normal loop if there are no statements that need masking.
1392      This only happens in rare degenerate cases: it means that the loop
1393      has no loads, no stores, and no live-out values.  */
1394   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395     return false;
1396
1397   /* For the decrementing IV we need to represent all values in
1398      [0, niter + niter_skip] where niter_skip is the elements we
1399      skip in the first iteration for prologue peeling.  */
1400   tree iv_type = NULL_TREE;
1401   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402   unsigned int iv_precision = UINT_MAX;
1403   if (iv_limit != -1)
1404     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1405
1406   /* First compute the type for the IV we use to track the remaining
1407      scalar iterations.  */
1408   opt_scalar_int_mode cmp_mode_iter;
1409   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1410     {
1411       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412       if (cmp_bits >= iv_precision
1413           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1414         {
1415           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416           if (iv_type)
1417             break;
1418         }
1419     }
1420   if (!iv_type)
1421     return false;
1422
1423   /* Produce the rgroup controls.  */
1424   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1425     {
1426       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427       tree vectype = mask.first;
1428       unsigned nvectors = mask.second;
1429
1430       /* The number of scalars per iteration and the number of vectors are
1431          both compile-time constants.  */
1432       unsigned int nscalars_per_iter
1433         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1435
1436       /* We index the rgroup_controls vector with nscalars_per_iter
1437          which we keep constant and instead have a varying nvectors,
1438          remembering the vector mask with the fewest nV.  */
1439       if (masks->rgc_vec.length () < nscalars_per_iter)
1440         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1442
1443       if (!rgm->type || rgm->factor > nvectors)
1444         {
1445           rgm->type = truth_type_for (vectype);
1446           rgm->compare_type = NULL_TREE;
1447           rgm->max_nscalars_per_iter = nscalars_per_iter;
1448           rgm->factor = nvectors;
1449           rgm->bias_adjusted_ctrl = NULL_TREE;
1450         }
1451     }
1452
1453   /* There is no fixed compare type we are going to use but we have to
1454      be able to get at one for each mask group.  */
1455   unsigned int min_ni_width
1456     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1457
1458   bool ok = true;
1459   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1460     {
1461       tree mask_type = rgc.type;
1462       if (!mask_type)
1463         continue;
1464
1465       /* For now vect_get_loop_mask only supports integer mode masks
1466          when we need to split it.  */
1467       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1468           || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1469         {
1470           ok = false;
1471           break;
1472         }
1473
1474       /* If iv_type is usable as compare type use that - we can elide the
1475          saturation in that case.   */
1476       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1477         {
1478           tree cmp_vectype
1479             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1480           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1481             rgc.compare_type = cmp_vectype;
1482         }
1483       if (!rgc.compare_type)
1484         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1485           {
1486             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1487             if (cmp_bits >= min_ni_width
1488                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1489               {
1490                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1491                 if (!cmp_type)
1492                   continue;
1493
1494                 /* Check whether we can produce the mask with cmp_type.  */
1495                 tree cmp_vectype
1496                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1497                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1498                   {
1499                     rgc.compare_type = cmp_vectype;
1500                     break;
1501                   }
1502               }
1503         }
1504       if (!rgc.compare_type)
1505         {
1506           ok = false;
1507           break;
1508         }
1509     }
1510   if (!ok)
1511     {
1512       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1513       return false;
1514     }
1515
1516   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1517   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1518   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1519   return true;
1520 }
1521
1522 /* Check whether we can use vector access with length based on precison
1523    comparison.  So far, to keep it simple, we only allow the case that the
1524    precision of the target supported length is larger than the precision
1525    required by loop niters.  */
1526
1527 static bool
1528 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1529 {
1530   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1531     return false;
1532
1533   machine_mode len_load_mode, len_store_mode;
1534   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1535          .exists (&len_load_mode))
1536     return false;
1537   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1538          .exists (&len_store_mode))
1539     return false;
1540
1541   signed char partial_load_bias = internal_len_load_store_bias
1542     (IFN_LEN_LOAD, len_load_mode);
1543
1544   signed char partial_store_bias = internal_len_load_store_bias
1545     (IFN_LEN_STORE, len_store_mode);
1546
1547   gcc_assert (partial_load_bias == partial_store_bias);
1548
1549   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1550     return false;
1551
1552   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1553      len_loads with a length of zero.  In order to avoid that we prohibit
1554      more than one loop length here.  */
1555   if (partial_load_bias == -1
1556       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1557     return false;
1558
1559   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1560
1561   unsigned int max_nitems_per_iter = 1;
1562   unsigned int i;
1563   rgroup_controls *rgl;
1564   /* Find the maximum number of items per iteration for every rgroup.  */
1565   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1566     {
1567       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1568       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1569     }
1570
1571   /* Work out how many bits we need to represent the length limit.  */
1572   unsigned int min_ni_prec
1573     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1574
1575   /* Now use the maximum of below precisions for one suitable IV type:
1576      - the IV's natural precision
1577      - the precision needed to hold: the maximum number of scalar
1578        iterations multiplied by the scale factor (min_ni_prec above)
1579      - the Pmode precision
1580
1581      If min_ni_prec is less than the precision of the current niters,
1582      we perfer to still use the niters type.  Prefer to use Pmode and
1583      wider IV to avoid narrow conversions.  */
1584
1585   unsigned int ni_prec
1586     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1587   min_ni_prec = MAX (min_ni_prec, ni_prec);
1588   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1589
1590   tree iv_type = NULL_TREE;
1591   opt_scalar_int_mode tmode_iter;
1592   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1593     {
1594       scalar_mode tmode = tmode_iter.require ();
1595       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1596
1597       /* ??? Do we really want to construct one IV whose precision exceeds
1598          BITS_PER_WORD?  */
1599       if (tbits > BITS_PER_WORD)
1600         break;
1601
1602       /* Find the first available standard integral type.  */
1603       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1604         {
1605           iv_type = build_nonstandard_integer_type (tbits, true);
1606           break;
1607         }
1608     }
1609
1610   if (!iv_type)
1611     {
1612       if (dump_enabled_p ())
1613         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614                          "can't vectorize with length-based partial vectors"
1615                          " because there is no suitable iv type.\n");
1616       return false;
1617     }
1618
1619   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1620   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1621   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1622
1623   return true;
1624 }
1625
1626 /* Calculate the cost of one scalar iteration of the loop.  */
1627 static void
1628 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1629 {
1630   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632   int nbbs = loop->num_nodes, factor;
1633   int innerloop_iters, i;
1634
1635   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1636
1637   /* Gather costs for statements in the scalar loop.  */
1638
1639   /* FORNOW.  */
1640   innerloop_iters = 1;
1641   if (loop->inner)
1642     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1643
1644   for (i = 0; i < nbbs; i++)
1645     {
1646       gimple_stmt_iterator si;
1647       basic_block bb = bbs[i];
1648
1649       if (bb->loop_father == loop->inner)
1650         factor = innerloop_iters;
1651       else
1652         factor = 1;
1653
1654       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1655         {
1656           gimple *stmt = gsi_stmt (si);
1657           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1658
1659           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1660             continue;
1661
1662           /* Skip stmts that are not vectorized inside the loop.  */
1663           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1664           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1665               && (!STMT_VINFO_LIVE_P (vstmt_info)
1666                   || !VECTORIZABLE_CYCLE_DEF
1667                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1668             continue;
1669
1670           vect_cost_for_stmt kind;
1671           if (STMT_VINFO_DATA_REF (stmt_info))
1672             {
1673               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1674                kind = scalar_load;
1675              else
1676                kind = scalar_store;
1677             }
1678           else if (vect_nop_conversion_p (stmt_info))
1679             continue;
1680           else
1681             kind = scalar_stmt;
1682
1683           /* We are using vect_prologue here to avoid scaling twice
1684              by the inner loop factor.  */
1685           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1686                             factor, kind, stmt_info, 0, vect_prologue);
1687         }
1688     }
1689
1690   /* Now accumulate cost.  */
1691   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1692   add_stmt_costs (loop_vinfo->scalar_costs,
1693                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1694   loop_vinfo->scalar_costs->finish_cost (nullptr);
1695 }
1696
1697
1698 /* Function vect_analyze_loop_form.
1699
1700    Verify that certain CFG restrictions hold, including:
1701    - the loop has a pre-header
1702    - the loop has a single entry and exit
1703    - the loop exit condition is simple enough
1704    - the number of iterations can be analyzed, i.e, a countable loop.  The
1705      niter could be analyzed under some assumptions.  */
1706
1707 opt_result
1708 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1709 {
1710   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1711
1712   edge exit_e = vec_init_loop_exit_info (loop);
1713   if (!exit_e)
1714     return opt_result::failure_at (vect_location,
1715                                    "not vectorized:"
1716                                    " could not determine main exit from"
1717                                    " loop with multiple exits.\n");
1718   info->loop_exit = exit_e;
1719   if (dump_enabled_p ())
1720       dump_printf_loc (MSG_NOTE, vect_location,
1721                        "using as main loop exit: %d -> %d [AUX: %p]\n",
1722                        exit_e->src->index, exit_e->dest->index, exit_e->aux);
1723
1724   /* Different restrictions apply when we are considering an inner-most loop,
1725      vs. an outer (nested) loop.
1726      (FORNOW. May want to relax some of these restrictions in the future).  */
1727
1728   info->inner_loop_cond = NULL;
1729   if (!loop->inner)
1730     {
1731       /* Inner-most loop.  We currently require that the number of BBs is
1732          exactly 2 (the header and latch).  Vectorizable inner-most loops
1733          look like this:
1734
1735                         (pre-header)
1736                            |
1737                           header <--------+
1738                            | |            |
1739                            | +--> latch --+
1740                            |
1741                         (exit-bb)  */
1742
1743       if (loop->num_nodes != 2)
1744         return opt_result::failure_at (vect_location,
1745                                        "not vectorized:"
1746                                        " control flow in loop.\n");
1747
1748       if (empty_block_p (loop->header))
1749         return opt_result::failure_at (vect_location,
1750                                        "not vectorized: empty loop.\n");
1751     }
1752   else
1753     {
1754       class loop *innerloop = loop->inner;
1755       edge entryedge;
1756
1757       /* Nested loop. We currently require that the loop is doubly-nested,
1758          contains a single inner loop, and the number of BBs is exactly 5.
1759          Vectorizable outer-loops look like this:
1760
1761                         (pre-header)
1762                            |
1763                           header <---+
1764                            |         |
1765                           inner-loop |
1766                            |         |
1767                           tail ------+
1768                            |
1769                         (exit-bb)
1770
1771          The inner-loop has the properties expected of inner-most loops
1772          as described above.  */
1773
1774       if ((loop->inner)->inner || (loop->inner)->next)
1775         return opt_result::failure_at (vect_location,
1776                                        "not vectorized:"
1777                                        " multiple nested loops.\n");
1778
1779       if (loop->num_nodes != 5)
1780         return opt_result::failure_at (vect_location,
1781                                        "not vectorized:"
1782                                        " control flow in loop.\n");
1783
1784       entryedge = loop_preheader_edge (innerloop);
1785       if (entryedge->src != loop->header
1786           || !single_exit (innerloop)
1787           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1788         return opt_result::failure_at (vect_location,
1789                                        "not vectorized:"
1790                                        " unsupported outerloop form.\n");
1791
1792       /* Analyze the inner-loop.  */
1793       vect_loop_form_info inner;
1794       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1795       if (!res)
1796         {
1797           if (dump_enabled_p ())
1798             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1799                              "not vectorized: Bad inner loop.\n");
1800           return res;
1801         }
1802
1803       /* Don't support analyzing niter under assumptions for inner
1804          loop.  */
1805       if (!integer_onep (inner.assumptions))
1806         return opt_result::failure_at (vect_location,
1807                                        "not vectorized: Bad inner loop.\n");
1808
1809       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1810         return opt_result::failure_at (vect_location,
1811                                        "not vectorized: inner-loop count not"
1812                                        " invariant.\n");
1813
1814       if (dump_enabled_p ())
1815         dump_printf_loc (MSG_NOTE, vect_location,
1816                          "Considering outer-loop vectorization.\n");
1817       info->inner_loop_cond = inner.conds[0];
1818     }
1819
1820   if (!single_exit (loop))
1821     return opt_result::failure_at (vect_location,
1822                                    "not vectorized: multiple exits.\n");
1823   if (EDGE_COUNT (loop->header->preds) != 2)
1824     return opt_result::failure_at (vect_location,
1825                                    "not vectorized:"
1826                                    " too many incoming edges.\n");
1827
1828   /* We assume that the loop exit condition is at the end of the loop. i.e,
1829      that the loop is represented as a do-while (with a proper if-guard
1830      before the loop if needed), where the loop header contains all the
1831      executable statements, and the latch is empty.  */
1832   if (!empty_block_p (loop->latch)
1833       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1834     return opt_result::failure_at (vect_location,
1835                                    "not vectorized: latch block not empty.\n");
1836
1837   /* Make sure the exit is not abnormal.  */
1838   if (exit_e->flags & EDGE_ABNORMAL)
1839     return opt_result::failure_at (vect_location,
1840                                    "not vectorized:"
1841                                    " abnormal loop exit edge.\n");
1842
1843   info->conds
1844     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1845                             &info->number_of_iterations,
1846                             &info->number_of_iterationsm1);
1847
1848   if (info->conds.is_empty ())
1849     return opt_result::failure_at
1850       (vect_location,
1851        "not vectorized: complicated exit condition.\n");
1852
1853   /* Determine what the primary and alternate exit conds are.  */
1854   for (unsigned i = 0; i < info->conds.length (); i++)
1855     {
1856       gcond *cond = info->conds[i];
1857       if (exit_e->src == gimple_bb (cond))
1858         std::swap (info->conds[0], info->conds[i]);
1859     }
1860
1861   if (integer_zerop (info->assumptions)
1862       || !info->number_of_iterations
1863       || chrec_contains_undetermined (info->number_of_iterations))
1864     return opt_result::failure_at
1865       (info->conds[0],
1866        "not vectorized: number of iterations cannot be computed.\n");
1867
1868   if (integer_zerop (info->number_of_iterations))
1869     return opt_result::failure_at
1870       (info->conds[0],
1871        "not vectorized: number of iterations = 0.\n");
1872
1873   if (!(tree_fits_shwi_p (info->number_of_iterations)
1874         && tree_to_shwi (info->number_of_iterations) > 0))
1875     {
1876       if (dump_enabled_p ())
1877         {
1878           dump_printf_loc (MSG_NOTE, vect_location,
1879                            "Symbolic number of iterations is ");
1880           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1881           dump_printf (MSG_NOTE, "\n");
1882         }
1883     }
1884
1885   return opt_result::success ();
1886 }
1887
1888 /* Create a loop_vec_info for LOOP with SHARED and the
1889    vect_analyze_loop_form result.  */
1890
1891 loop_vec_info
1892 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1893                         const vect_loop_form_info *info,
1894                         loop_vec_info main_loop_info)
1895 {
1896   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1897   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1898   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1899   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1900   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1901   /* Also record the assumptions for versioning.  */
1902   if (!integer_onep (info->assumptions) && !main_loop_info)
1903     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1904
1905   for (gcond *cond : info->conds)
1906     {
1907       stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1908       STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1909     }
1910
1911   for (unsigned i = 1; i < info->conds.length (); i ++)
1912     LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1913   LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1914
1915   LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1916
1917   if (info->inner_loop_cond)
1918     {
1919       stmt_vec_info inner_loop_cond_info
1920         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1921       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1922       /* If we have an estimate on the number of iterations of the inner
1923          loop use that to limit the scale for costing, otherwise use
1924          --param vect-inner-loop-cost-factor literally.  */
1925       widest_int nit;
1926       if (estimated_stmt_executions (loop->inner, &nit))
1927         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1928           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1929     }
1930
1931   return loop_vinfo;
1932 }
1933
1934
1935
1936 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1937    statements update the vectorization factor.  */
1938
1939 static void
1940 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1941 {
1942   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1944   int nbbs = loop->num_nodes;
1945   poly_uint64 vectorization_factor;
1946   int i;
1947
1948   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1949
1950   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1951   gcc_assert (known_ne (vectorization_factor, 0U));
1952
1953   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1954      vectorization factor of the loop is the unrolling factor required by
1955      the SLP instances.  If that unrolling factor is 1, we say, that we
1956      perform pure SLP on loop - cross iteration parallelism is not
1957      exploited.  */
1958   bool only_slp_in_loop = true;
1959   for (i = 0; i < nbbs; i++)
1960     {
1961       basic_block bb = bbs[i];
1962       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1963            gsi_next (&si))
1964         {
1965           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1966           if (!stmt_info)
1967             continue;
1968           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1969                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1970               && !PURE_SLP_STMT (stmt_info))
1971             /* STMT needs both SLP and loop-based vectorization.  */
1972             only_slp_in_loop = false;
1973         }
1974       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1975            gsi_next (&si))
1976         {
1977           if (is_gimple_debug (gsi_stmt (si)))
1978             continue;
1979           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1980           stmt_info = vect_stmt_to_vectorize (stmt_info);
1981           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1982                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1983               && !PURE_SLP_STMT (stmt_info))
1984             /* STMT needs both SLP and loop-based vectorization.  */
1985             only_slp_in_loop = false;
1986         }
1987     }
1988
1989   if (only_slp_in_loop)
1990     {
1991       if (dump_enabled_p ())
1992         dump_printf_loc (MSG_NOTE, vect_location,
1993                          "Loop contains only SLP stmts\n");
1994       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1995     }
1996   else
1997     {
1998       if (dump_enabled_p ())
1999         dump_printf_loc (MSG_NOTE, vect_location,
2000                          "Loop contains SLP and non-SLP stmts\n");
2001       /* Both the vectorization factor and unroll factor have the form
2002          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2003          so they must have a common multiple.  */
2004       vectorization_factor
2005         = force_common_multiple (vectorization_factor,
2006                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2007     }
2008
2009   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2010   if (dump_enabled_p ())
2011     {
2012       dump_printf_loc (MSG_NOTE, vect_location,
2013                        "Updating vectorization factor to ");
2014       dump_dec (MSG_NOTE, vectorization_factor);
2015       dump_printf (MSG_NOTE, ".\n");
2016     }
2017 }
2018
2019 /* Return true if STMT_INFO describes a double reduction phi and if
2020    the other phi in the reduction is also relevant for vectorization.
2021    This rejects cases such as:
2022
2023       outer1:
2024         x_1 = PHI <x_3(outer2), ...>;
2025         ...
2026
2027       inner:
2028         x_2 = ...;
2029         ...
2030
2031       outer2:
2032         x_3 = PHI <x_2(inner)>;
2033
2034    if nothing in x_2 or elsewhere makes x_1 relevant.  */
2035
2036 static bool
2037 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2038 {
2039   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2040     return false;
2041
2042   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2043 }
2044
2045 /* Function vect_analyze_loop_operations.
2046
2047    Scan the loop stmts and make sure they are all vectorizable.  */
2048
2049 static opt_result
2050 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2051 {
2052   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2053   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2054   int nbbs = loop->num_nodes;
2055   int i;
2056   stmt_vec_info stmt_info;
2057   bool need_to_vectorize = false;
2058   bool ok;
2059
2060   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2061
2062   auto_vec<stmt_info_for_cost> cost_vec;
2063
2064   for (i = 0; i < nbbs; i++)
2065     {
2066       basic_block bb = bbs[i];
2067
2068       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2069            gsi_next (&si))
2070         {
2071           gphi *phi = si.phi ();
2072           ok = true;
2073
2074           stmt_info = loop_vinfo->lookup_stmt (phi);
2075           if (dump_enabled_p ())
2076             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2077                              (gimple *) phi);
2078           if (virtual_operand_p (gimple_phi_result (phi)))
2079             continue;
2080
2081           /* Inner-loop loop-closed exit phi in outer-loop vectorization
2082              (i.e., a phi in the tail of the outer-loop).  */
2083           if (! is_loop_header_bb_p (bb))
2084             {
2085               /* FORNOW: we currently don't support the case that these phis
2086                  are not used in the outerloop (unless it is double reduction,
2087                  i.e., this phi is vect_reduction_def), cause this case
2088                  requires to actually do something here.  */
2089               if (STMT_VINFO_LIVE_P (stmt_info)
2090                   && !vect_active_double_reduction_p (stmt_info))
2091                 return opt_result::failure_at (phi,
2092                                                "Unsupported loop-closed phi"
2093                                                " in outer-loop.\n");
2094
2095               /* If PHI is used in the outer loop, we check that its operand
2096                  is defined in the inner loop.  */
2097               if (STMT_VINFO_RELEVANT_P (stmt_info))
2098                 {
2099                   tree phi_op;
2100
2101                   if (gimple_phi_num_args (phi) != 1)
2102                     return opt_result::failure_at (phi, "unsupported phi");
2103
2104                   phi_op = PHI_ARG_DEF (phi, 0);
2105                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2106                   if (!op_def_info)
2107                     return opt_result::failure_at (phi, "unsupported phi\n");
2108
2109                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2110                       && (STMT_VINFO_RELEVANT (op_def_info)
2111                           != vect_used_in_outer_by_reduction))
2112                     return opt_result::failure_at (phi, "unsupported phi\n");
2113
2114                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2115                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2116                            == vect_double_reduction_def))
2117                       && !vectorizable_lc_phi (loop_vinfo,
2118                                                stmt_info, NULL, NULL))
2119                     return opt_result::failure_at (phi, "unsupported phi\n");
2120                 }
2121
2122               continue;
2123             }
2124
2125           gcc_assert (stmt_info);
2126
2127           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2128                || STMT_VINFO_LIVE_P (stmt_info))
2129               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2130               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2131             /* A scalar-dependence cycle that we don't support.  */
2132             return opt_result::failure_at (phi,
2133                                            "not vectorized:"
2134                                            " scalar dependence cycle.\n");
2135
2136           if (STMT_VINFO_RELEVANT_P (stmt_info))
2137             {
2138               need_to_vectorize = true;
2139               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2140                   && ! PURE_SLP_STMT (stmt_info))
2141                 ok = vectorizable_induction (loop_vinfo,
2142                                              stmt_info, NULL, NULL,
2143                                              &cost_vec);
2144               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2145                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2146                             == vect_double_reduction_def)
2147                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2148                        && ! PURE_SLP_STMT (stmt_info))
2149                 ok = vectorizable_reduction (loop_vinfo,
2150                                              stmt_info, NULL, NULL, &cost_vec);
2151               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2152                         == vect_first_order_recurrence)
2153                        && ! PURE_SLP_STMT (stmt_info))
2154                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2155                                            &cost_vec);
2156             }
2157
2158           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2159           if (ok
2160               && STMT_VINFO_LIVE_P (stmt_info)
2161               && !PURE_SLP_STMT (stmt_info))
2162             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2163                                               -1, false, &cost_vec);
2164
2165           if (!ok)
2166             return opt_result::failure_at (phi,
2167                                            "not vectorized: relevant phi not "
2168                                            "supported: %G",
2169                                            static_cast <gimple *> (phi));
2170         }
2171
2172       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2173            gsi_next (&si))
2174         {
2175           gimple *stmt = gsi_stmt (si);
2176           if (!gimple_clobber_p (stmt)
2177               && !is_gimple_debug (stmt))
2178             {
2179               opt_result res
2180                 = vect_analyze_stmt (loop_vinfo,
2181                                      loop_vinfo->lookup_stmt (stmt),
2182                                      &need_to_vectorize,
2183                                      NULL, NULL, &cost_vec);
2184               if (!res)
2185                 return res;
2186             }
2187         }
2188     } /* bbs */
2189
2190   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2191
2192   /* All operations in the loop are either irrelevant (deal with loop
2193      control, or dead), or only used outside the loop and can be moved
2194      out of the loop (e.g. invariants, inductions).  The loop can be
2195      optimized away by scalar optimizations.  We're better off not
2196      touching this loop.  */
2197   if (!need_to_vectorize)
2198     {
2199       if (dump_enabled_p ())
2200         dump_printf_loc (MSG_NOTE, vect_location,
2201                          "All the computation can be taken out of the loop.\n");
2202       return opt_result::failure_at
2203         (vect_location,
2204          "not vectorized: redundant loop. no profit to vectorize.\n");
2205     }
2206
2207   return opt_result::success ();
2208 }
2209
2210 /* Return true if we know that the iteration count is smaller than the
2211    vectorization factor.  Return false if it isn't, or if we can't be sure
2212    either way.  */
2213
2214 static bool
2215 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2216 {
2217   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2218
2219   HOST_WIDE_INT max_niter;
2220   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2221     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2222   else
2223     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2224
2225   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2226     return true;
2227
2228   return false;
2229 }
2230
2231 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2232    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2233    definitely no, or -1 if it's worth retrying.  */
2234
2235 static int
2236 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2237                            unsigned *suggested_unroll_factor)
2238 {
2239   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2240   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2241
2242   /* Only loops that can handle partially-populated vectors can have iteration
2243      counts less than the vectorization factor.  */
2244   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2245       && vect_known_niters_smaller_than_vf (loop_vinfo))
2246     {
2247       if (dump_enabled_p ())
2248         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249                          "not vectorized: iteration count smaller than "
2250                          "vectorization factor.\n");
2251       return 0;
2252     }
2253
2254   /* If we know the number of iterations we can do better, for the
2255      epilogue we can also decide whether the main loop leaves us
2256      with enough iterations, prefering a smaller vector epilog then
2257      also possibly used for the case we skip the vector loop.  */
2258   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2259     {
2260       widest_int scalar_niters
2261         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2262       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2263         {
2264           loop_vec_info orig_loop_vinfo
2265             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2266           unsigned lowest_vf
2267             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2268           int prolog_peeling = 0;
2269           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2270             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2271           if (prolog_peeling >= 0
2272               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2273                            lowest_vf))
2274             {
2275               unsigned gap
2276                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2277               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2278                                % lowest_vf + gap);
2279             }
2280         }
2281       /* Reject vectorizing for a single scalar iteration, even if
2282          we could in principle implement that using partial vectors.  */
2283       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2284       if (scalar_niters <= peeling_gap + 1)
2285         {
2286           if (dump_enabled_p ())
2287             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288                              "not vectorized: loop only has a single "
2289                              "scalar iteration.\n");
2290           return 0;
2291         }
2292
2293       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2294         {
2295           /* Check that the loop processes at least one full vector.  */
2296           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2297           if (known_lt (scalar_niters, vf))
2298             {
2299               if (dump_enabled_p ())
2300                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301                                  "loop does not have enough iterations "
2302                                  "to support vectorization.\n");
2303               return 0;
2304             }
2305
2306           /* If we need to peel an extra epilogue iteration to handle data
2307              accesses with gaps, check that there are enough scalar iterations
2308              available.
2309
2310              The check above is redundant with this one when peeling for gaps,
2311              but the distinction is useful for diagnostics.  */
2312           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2313               && known_le (scalar_niters, vf))
2314             {
2315               if (dump_enabled_p ())
2316                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317                                  "loop does not have enough iterations "
2318                                  "to support peeling for gaps.\n");
2319               return 0;
2320             }
2321         }
2322     }
2323
2324   /* If using the "very cheap" model. reject cases in which we'd keep
2325      a copy of the scalar code (even if we might be able to vectorize it).  */
2326   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2327       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2328           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2329           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2330     {
2331       if (dump_enabled_p ())
2332         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333                          "some scalar iterations would need to be peeled\n");
2334       return 0;
2335     }
2336
2337   int min_profitable_iters, min_profitable_estimate;
2338   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2339                                       &min_profitable_estimate,
2340                                       suggested_unroll_factor);
2341
2342   if (min_profitable_iters < 0)
2343     {
2344       if (dump_enabled_p ())
2345         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346                          "not vectorized: vectorization not profitable.\n");
2347       if (dump_enabled_p ())
2348         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349                          "not vectorized: vector version will never be "
2350                          "profitable.\n");
2351       return -1;
2352     }
2353
2354   int min_scalar_loop_bound = (param_min_vect_loop_bound
2355                                * assumed_vf);
2356
2357   /* Use the cost model only if it is more conservative than user specified
2358      threshold.  */
2359   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2360                                     min_profitable_iters);
2361
2362   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2363
2364   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2365       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2366     {
2367       if (dump_enabled_p ())
2368         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369                          "not vectorized: vectorization not profitable.\n");
2370       if (dump_enabled_p ())
2371         dump_printf_loc (MSG_NOTE, vect_location,
2372                          "not vectorized: iteration count smaller than user "
2373                          "specified loop bound parameter or minimum profitable "
2374                          "iterations (whichever is more conservative).\n");
2375       return 0;
2376     }
2377
2378   /* The static profitablity threshold min_profitable_estimate includes
2379      the cost of having to check at runtime whether the scalar loop
2380      should be used instead.  If it turns out that we don't need or want
2381      such a check, the threshold we should use for the static estimate
2382      is simply the point at which the vector loop becomes more profitable
2383      than the scalar loop.  */
2384   if (min_profitable_estimate > min_profitable_iters
2385       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2386       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2387       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2388       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2389     {
2390       if (dump_enabled_p ())
2391         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2392                          " choice between the scalar and vector loops\n");
2393       min_profitable_estimate = min_profitable_iters;
2394     }
2395
2396   /* If the vector loop needs multiple iterations to be beneficial then
2397      things are probably too close to call, and the conservative thing
2398      would be to stick with the scalar code.  */
2399   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2400       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2401     {
2402       if (dump_enabled_p ())
2403         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2404                          "one iteration of the vector loop would be"
2405                          " more expensive than the equivalent number of"
2406                          " iterations of the scalar loop\n");
2407       return 0;
2408     }
2409
2410   HOST_WIDE_INT estimated_niter;
2411
2412   /* If we are vectorizing an epilogue then we know the maximum number of
2413      scalar iterations it will cover is at least one lower than the
2414      vectorization factor of the main loop.  */
2415   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2416     estimated_niter
2417       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2418   else
2419     {
2420       estimated_niter = estimated_stmt_executions_int (loop);
2421       if (estimated_niter == -1)
2422         estimated_niter = likely_max_stmt_executions_int (loop);
2423     }
2424   if (estimated_niter != -1
2425       && ((unsigned HOST_WIDE_INT) estimated_niter
2426           < MAX (th, (unsigned) min_profitable_estimate)))
2427     {
2428       if (dump_enabled_p ())
2429         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430                          "not vectorized: estimated iteration count too "
2431                          "small.\n");
2432       if (dump_enabled_p ())
2433         dump_printf_loc (MSG_NOTE, vect_location,
2434                          "not vectorized: estimated iteration count smaller "
2435                          "than specified loop bound parameter or minimum "
2436                          "profitable iterations (whichever is more "
2437                          "conservative).\n");
2438       return -1;
2439     }
2440
2441   return 1;
2442 }
2443
2444 static opt_result
2445 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2446                            vec<data_reference_p> *datarefs,
2447                            unsigned int *n_stmts)
2448 {
2449   *n_stmts = 0;
2450   for (unsigned i = 0; i < loop->num_nodes; i++)
2451     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2452          !gsi_end_p (gsi); gsi_next (&gsi))
2453       {
2454         gimple *stmt = gsi_stmt (gsi);
2455         if (is_gimple_debug (stmt))
2456           continue;
2457         ++(*n_stmts);
2458         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2459                                                         NULL, 0);
2460         if (!res)
2461           {
2462             if (is_gimple_call (stmt) && loop->safelen)
2463               {
2464                 tree fndecl = gimple_call_fndecl (stmt), op;
2465                 if (fndecl == NULL_TREE
2466                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2467                   {
2468                     fndecl = gimple_call_arg (stmt, 0);
2469                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2470                     fndecl = TREE_OPERAND (fndecl, 0);
2471                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2472                   }
2473                 if (fndecl != NULL_TREE)
2474                   {
2475                     cgraph_node *node = cgraph_node::get (fndecl);
2476                     if (node != NULL && node->simd_clones != NULL)
2477                       {
2478                         unsigned int j, n = gimple_call_num_args (stmt);
2479                         for (j = 0; j < n; j++)
2480                           {
2481                             op = gimple_call_arg (stmt, j);
2482                             if (DECL_P (op)
2483                                 || (REFERENCE_CLASS_P (op)
2484                                     && get_base_address (op)))
2485                               break;
2486                           }
2487                         op = gimple_call_lhs (stmt);
2488                         /* Ignore #pragma omp declare simd functions
2489                            if they don't have data references in the
2490                            call stmt itself.  */
2491                         if (j == n
2492                             && !(op
2493                                  && (DECL_P (op)
2494                                      || (REFERENCE_CLASS_P (op)
2495                                          && get_base_address (op)))))
2496                           continue;
2497                       }
2498                   }
2499               }
2500             return res;
2501           }
2502         /* If dependence analysis will give up due to the limit on the
2503            number of datarefs stop here and fail fatally.  */
2504         if (datarefs->length ()
2505             > (unsigned)param_loop_max_datarefs_for_datadeps)
2506           return opt_result::failure_at (stmt, "exceeded param "
2507                                          "loop-max-datarefs-for-datadeps\n");
2508       }
2509   return opt_result::success ();
2510 }
2511
2512 /* Look for SLP-only access groups and turn each individual access into its own
2513    group.  */
2514 static void
2515 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2516 {
2517   unsigned int i;
2518   struct data_reference *dr;
2519
2520   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2521
2522   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2523   FOR_EACH_VEC_ELT (datarefs, i, dr)
2524     {
2525       gcc_assert (DR_REF (dr));
2526       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2527
2528       /* Check if the load is a part of an interleaving chain.  */
2529       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2530         {
2531           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2532           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2533           unsigned int group_size = DR_GROUP_SIZE (first_element);
2534
2535           /* Check if SLP-only groups.  */
2536           if (!STMT_SLP_TYPE (stmt_info)
2537               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2538             {
2539               /* Dissolve the group.  */
2540               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2541
2542               stmt_vec_info vinfo = first_element;
2543               while (vinfo)
2544                 {
2545                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2546                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2547                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2548                   DR_GROUP_SIZE (vinfo) = 1;
2549                   if (STMT_VINFO_STRIDED_P (first_element)
2550                       /* We cannot handle stores with gaps.  */
2551                       || DR_IS_WRITE (dr_info->dr))
2552                     {
2553                       STMT_VINFO_STRIDED_P (vinfo) = true;
2554                       DR_GROUP_GAP (vinfo) = 0;
2555                     }
2556                   else
2557                     DR_GROUP_GAP (vinfo) = group_size - 1;
2558                   /* Duplicate and adjust alignment info, it needs to
2559                      be present on each group leader, see dr_misalignment.  */
2560                   if (vinfo != first_element)
2561                     {
2562                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2563                       dr_info2->target_alignment = dr_info->target_alignment;
2564                       int misalignment = dr_info->misalignment;
2565                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2566                         {
2567                           HOST_WIDE_INT diff
2568                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2569                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2570                           unsigned HOST_WIDE_INT align_c
2571                             = dr_info->target_alignment.to_constant ();
2572                           misalignment = (misalignment + diff) % align_c;
2573                         }
2574                       dr_info2->misalignment = misalignment;
2575                     }
2576                   vinfo = next;
2577                 }
2578             }
2579         }
2580     }
2581 }
2582
2583 /* Determine if operating on full vectors for LOOP_VINFO might leave
2584    some scalar iterations still to do.  If so, decide how we should
2585    handle those scalar iterations.  The possibilities are:
2586
2587    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2588        In this case:
2589
2590          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2591          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2592          LOOP_VINFO_PEELING_FOR_NITER == false
2593
2594    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2595        to handle the remaining scalar iterations.  In this case:
2596
2597          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2598          LOOP_VINFO_PEELING_FOR_NITER == true
2599
2600        There are two choices:
2601
2602        (2a) Consider vectorizing the epilogue loop at the same VF as the
2603             main loop, but using partial vectors instead of full vectors.
2604             In this case:
2605
2606               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2607
2608        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2609             In this case:
2610
2611               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2612  */
2613
2614 opt_result
2615 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2616 {
2617   /* Determine whether there would be any scalar iterations left over.  */
2618   bool need_peeling_or_partial_vectors_p
2619     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2620
2621   /* Decide whether to vectorize the loop with partial vectors.  */
2622   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2623   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2624   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2625       && need_peeling_or_partial_vectors_p)
2626     {
2627       /* For partial-vector-usage=1, try to push the handling of partial
2628          vectors to the epilogue, with the main loop continuing to operate
2629          on full vectors.
2630
2631          If we are unrolling we also do not want to use partial vectors. This
2632          is to avoid the overhead of generating multiple masks and also to
2633          avoid having to execute entire iterations of FALSE masked instructions
2634          when dealing with one or less full iterations.
2635
2636          ??? We could then end up failing to use partial vectors if we
2637          decide to peel iterations into a prologue, and if the main loop
2638          then ends up processing fewer than VF iterations.  */
2639       if ((param_vect_partial_vector_usage == 1
2640            || loop_vinfo->suggested_unroll_factor > 1)
2641           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2642           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2643         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2644       else
2645         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2646     }
2647
2648   if (dump_enabled_p ())
2649     dump_printf_loc (MSG_NOTE, vect_location,
2650                      "operating on %s vectors%s.\n",
2651                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2652                      ? "partial" : "full",
2653                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2654                      ? " for epilogue loop" : "");
2655
2656   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2657     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2658        && need_peeling_or_partial_vectors_p);
2659
2660   return opt_result::success ();
2661 }
2662
2663 /* Function vect_analyze_loop_2.
2664
2665    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2666    analyses will record information in some members of LOOP_VINFO.  FATAL
2667    indicates if some analysis meets fatal error.  If one non-NULL pointer
2668    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2669    worked out suggested unroll factor, while one NULL pointer shows it's
2670    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2671    is to hold the slp decision when the suggested unroll factor is worked
2672    out.  */
2673 static opt_result
2674 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2675                      unsigned *suggested_unroll_factor,
2676                      bool& slp_done_for_suggested_uf)
2677 {
2678   opt_result ok = opt_result::success ();
2679   int res;
2680   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2681   poly_uint64 min_vf = 2;
2682   loop_vec_info orig_loop_vinfo = NULL;
2683
2684   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2685      loop_vec_info of the first vectorized loop.  */
2686   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2687     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2688   else
2689     orig_loop_vinfo = loop_vinfo;
2690   gcc_assert (orig_loop_vinfo);
2691
2692   /* The first group of checks is independent of the vector size.  */
2693   fatal = true;
2694
2695   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2696       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2697     return opt_result::failure_at (vect_location,
2698                                    "not vectorized: simd if(0)\n");
2699
2700   /* Find all data references in the loop (which correspond to vdefs/vuses)
2701      and analyze their evolution in the loop.  */
2702
2703   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2704
2705   /* Gather the data references and count stmts in the loop.  */
2706   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2707     {
2708       opt_result res
2709         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2710                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2711                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2712       if (!res)
2713         {
2714           if (dump_enabled_p ())
2715             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2716                              "not vectorized: loop contains function "
2717                              "calls or data references that cannot "
2718                              "be analyzed\n");
2719           return res;
2720         }
2721       loop_vinfo->shared->save_datarefs ();
2722     }
2723   else
2724     loop_vinfo->shared->check_datarefs ();
2725
2726   /* Analyze the data references and also adjust the minimal
2727      vectorization factor according to the loads and stores.  */
2728
2729   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2730   if (!ok)
2731     {
2732       if (dump_enabled_p ())
2733         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2734                          "bad data references.\n");
2735       return ok;
2736     }
2737
2738   /* Check if we are applying unroll factor now.  */
2739   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2740   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2741
2742   /* If the slp decision is false when suggested unroll factor is worked
2743      out, and we are applying suggested unroll factor, we can simply skip
2744      all slp related analyses this time.  */
2745   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2746
2747   /* Classify all cross-iteration scalar data-flow cycles.
2748      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2749   vect_analyze_scalar_cycles (loop_vinfo, slp);
2750
2751   vect_pattern_recog (loop_vinfo);
2752
2753   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2754
2755   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2756      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2757
2758   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2759   if (!ok)
2760     {
2761       if (dump_enabled_p ())
2762         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2763                          "bad data access.\n");
2764       return ok;
2765     }
2766
2767   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2768
2769   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2770   if (!ok)
2771     {
2772       if (dump_enabled_p ())
2773         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2774                          "unexpected pattern.\n");
2775       return ok;
2776     }
2777
2778   /* While the rest of the analysis below depends on it in some way.  */
2779   fatal = false;
2780
2781   /* Analyze data dependences between the data-refs in the loop
2782      and adjust the maximum vectorization factor according to
2783      the dependences.
2784      FORNOW: fail at the first data dependence that we encounter.  */
2785
2786   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2787   if (!ok)
2788     {
2789       if (dump_enabled_p ())
2790         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2791                          "bad data dependence.\n");
2792       return ok;
2793     }
2794   if (max_vf != MAX_VECTORIZATION_FACTOR
2795       && maybe_lt (max_vf, min_vf))
2796     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2797   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2798
2799   ok = vect_determine_vectorization_factor (loop_vinfo);
2800   if (!ok)
2801     {
2802       if (dump_enabled_p ())
2803         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2804                          "can't determine vectorization factor.\n");
2805       return ok;
2806     }
2807   if (max_vf != MAX_VECTORIZATION_FACTOR
2808       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2809     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2810
2811   /* Compute the scalar iteration cost.  */
2812   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2813
2814   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2815
2816   if (slp)
2817     {
2818       /* Check the SLP opportunities in the loop, analyze and build
2819          SLP trees.  */
2820       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2821       if (!ok)
2822         return ok;
2823
2824       /* If there are any SLP instances mark them as pure_slp.  */
2825       slp = vect_make_slp_decision (loop_vinfo);
2826       if (slp)
2827         {
2828           /* Find stmts that need to be both vectorized and SLPed.  */
2829           vect_detect_hybrid_slp (loop_vinfo);
2830
2831           /* Update the vectorization factor based on the SLP decision.  */
2832           vect_update_vf_for_slp (loop_vinfo);
2833
2834           /* Optimize the SLP graph with the vectorization factor fixed.  */
2835           vect_optimize_slp (loop_vinfo);
2836
2837           /* Gather the loads reachable from the SLP graph entries.  */
2838           vect_gather_slp_loads (loop_vinfo);
2839         }
2840     }
2841
2842   bool saved_can_use_partial_vectors_p
2843     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2844
2845   /* We don't expect to have to roll back to anything other than an empty
2846      set of rgroups.  */
2847   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2848
2849   /* This is the point where we can re-start analysis with SLP forced off.  */
2850 start_over:
2851
2852   /* Apply the suggested unrolling factor, this was determined by the backend
2853      during finish_cost the first time we ran the analyzis for this
2854      vector mode.  */
2855   if (applying_suggested_uf)
2856     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2857
2858   /* Now the vectorization factor is final.  */
2859   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2860   gcc_assert (known_ne (vectorization_factor, 0U));
2861
2862   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2863     {
2864       dump_printf_loc (MSG_NOTE, vect_location,
2865                        "vectorization_factor = ");
2866       dump_dec (MSG_NOTE, vectorization_factor);
2867       dump_printf (MSG_NOTE, ", niters = %wd\n",
2868                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2869     }
2870
2871   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2872
2873   /* Analyze the alignment of the data-refs in the loop.
2874      Fail if a data reference is found that cannot be vectorized.  */
2875
2876   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2877   if (!ok)
2878     {
2879       if (dump_enabled_p ())
2880         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2881                          "bad data alignment.\n");
2882       return ok;
2883     }
2884
2885   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2886      It is important to call pruning after vect_analyze_data_ref_accesses,
2887      since we use grouping information gathered by interleaving analysis.  */
2888   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2889   if (!ok)
2890     return ok;
2891
2892   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2893      vectorization, since we do not want to add extra peeling or
2894      add versioning for alignment.  */
2895   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2896     /* This pass will decide on using loop versioning and/or loop peeling in
2897        order to enhance the alignment of data references in the loop.  */
2898     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2899   if (!ok)
2900     return ok;
2901
2902   if (slp)
2903     {
2904       /* Analyze operations in the SLP instances.  Note this may
2905          remove unsupported SLP instances which makes the above
2906          SLP kind detection invalid.  */
2907       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2908       vect_slp_analyze_operations (loop_vinfo);
2909       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2910         {
2911           ok = opt_result::failure_at (vect_location,
2912                                        "unsupported SLP instances\n");
2913           goto again;
2914         }
2915
2916       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2917       slp_tree load_node, slp_root;
2918       unsigned i, x;
2919       slp_instance instance;
2920       bool can_use_lanes = true;
2921       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2922         {
2923           slp_root = SLP_INSTANCE_TREE (instance);
2924           int group_size = SLP_TREE_LANES (slp_root);
2925           tree vectype = SLP_TREE_VECTYPE (slp_root);
2926           bool loads_permuted = false;
2927           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2928             {
2929               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2930                 continue;
2931               unsigned j;
2932               stmt_vec_info load_info;
2933               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2934                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2935                   {
2936                     loads_permuted = true;
2937                     break;
2938                   }
2939             }
2940
2941           /* If the loads and stores can be handled with load/store-lane
2942              instructions record it and move on to the next instance.  */
2943           if (loads_permuted
2944               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2945               && vect_store_lanes_supported (vectype, group_size, false)
2946                    != IFN_LAST)
2947             {
2948               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2949                 if (STMT_VINFO_GROUPED_ACCESS
2950                       (SLP_TREE_REPRESENTATIVE (load_node)))
2951                   {
2952                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2953                         (SLP_TREE_REPRESENTATIVE (load_node));
2954                     /* Use SLP for strided accesses (or if we can't
2955                        load-lanes).  */
2956                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2957                         || vect_load_lanes_supported
2958                              (STMT_VINFO_VECTYPE (stmt_vinfo),
2959                               DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2960                       break;
2961                   }
2962
2963               can_use_lanes
2964                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2965
2966               if (can_use_lanes && dump_enabled_p ())
2967                 dump_printf_loc (MSG_NOTE, vect_location,
2968                                  "SLP instance %p can use load/store-lanes\n",
2969                                  (void *) instance);
2970             }
2971           else
2972             {
2973               can_use_lanes = false;
2974               break;
2975             }
2976         }
2977
2978       /* If all SLP instances can use load/store-lanes abort SLP and try again
2979          with SLP disabled.  */
2980       if (can_use_lanes)
2981         {
2982           ok = opt_result::failure_at (vect_location,
2983                                        "Built SLP cancelled: can use "
2984                                        "load/store-lanes\n");
2985           if (dump_enabled_p ())
2986             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2987                              "Built SLP cancelled: all SLP instances support "
2988                              "load/store-lanes\n");
2989           goto again;
2990         }
2991     }
2992
2993   /* Dissolve SLP-only groups.  */
2994   vect_dissolve_slp_only_groups (loop_vinfo);
2995
2996   /* Scan all the remaining operations in the loop that are not subject
2997      to SLP and make sure they are vectorizable.  */
2998   ok = vect_analyze_loop_operations (loop_vinfo);
2999   if (!ok)
3000     {
3001       if (dump_enabled_p ())
3002         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3003                          "bad operation or unsupported loop bound.\n");
3004       return ok;
3005     }
3006
3007   /* For now, we don't expect to mix both masking and length approaches for one
3008      loop, disable it if both are recorded.  */
3009   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3010       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3011       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3012     {
3013       if (dump_enabled_p ())
3014         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3015                          "can't vectorize a loop with partial vectors"
3016                          " because we don't expect to mix different"
3017                          " approaches with partial vectors for the"
3018                          " same loop.\n");
3019       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3020     }
3021
3022   /* If we still have the option of using partial vectors,
3023      check whether we can generate the necessary loop controls.  */
3024   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3025     {
3026       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3027         {
3028           if (!vect_verify_full_masking (loop_vinfo)
3029               && !vect_verify_full_masking_avx512 (loop_vinfo))
3030             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3031         }
3032       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3033         if (!vect_verify_loop_lens (loop_vinfo))
3034           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3035     }
3036
3037   /* If we're vectorizing a loop that uses length "controls" and
3038      can iterate more than once, we apply decrementing IV approach
3039      in loop control.  */
3040   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3041       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3042       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3043       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3044            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3045                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3046     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3047
3048   /* If a loop uses length controls and has a decrementing loop control IV,
3049      we will normally pass that IV through a MIN_EXPR to calcaluate the
3050      basis for the length controls.  E.g. in a loop that processes one
3051      element per scalar iteration, the number of elements would be
3052      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3053
3054      This MIN_EXPR approach allows us to use pointer IVs with an invariant
3055      step, since only the final iteration of the vector loop can have
3056      inactive lanes.
3057
3058      However, some targets have a dedicated instruction for calculating the
3059      preferred length, given the total number of elements that still need to
3060      be processed.  This is encapsulated in the SELECT_VL internal function.
3061
3062      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3063      to determine the basis for the length controls.  However, unlike the
3064      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3065      lanes inactive in any iteration of the vector loop, not just the last
3066      iteration.  This SELECT_VL approach therefore requires us to use pointer
3067      IVs with variable steps.
3068
3069      Once we've decided how many elements should be processed by one
3070      iteration of the vector loop, we need to populate the rgroup controls.
3071      If a loop has multiple rgroups, we need to make sure that those rgroups
3072      "line up" (that is, they must be consistent about which elements are
3073      active and which aren't).  This is done by vect_adjust_loop_lens_control.
3074
3075      In principle, it would be possible to use vect_adjust_loop_lens_control
3076      on either the result of a MIN_EXPR or the result of a SELECT_VL.
3077      However:
3078
3079      (1) In practice, it only makes sense to use SELECT_VL when a vector
3080          operation will be controlled directly by the result.  It is not
3081          worth using SELECT_VL if it would only be the input to other
3082          calculations.
3083
3084      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3085          pointer IV will need N updates by a variable amount (N-1 updates
3086          within the iteration and 1 update to move to the next iteration).
3087
3088      Because of this, we prefer to use the MIN_EXPR approach whenever there
3089      is more than one length control.
3090
3091      In addition, SELECT_VL always operates to a granularity of 1 unit.
3092      If we wanted to use it to control an SLP operation on N consecutive
3093      elements, we would need to make the SELECT_VL inputs measure scalar
3094      iterations (rather than elements) and then multiply the SELECT_VL
3095      result by N.  But using SELECT_VL this way is inefficient because
3096      of (1) above.
3097
3098      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3099         satisfied:
3100
3101      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3102      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3103
3104      Since SELECT_VL (variable step) will make SCEV analysis failed and then
3105      we will fail to gain benefits of following unroll optimizations. We prefer
3106      using the MIN_EXPR approach in this situation.  */
3107   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3108     {
3109       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3110       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3111                                           OPTIMIZE_FOR_SPEED)
3112           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3113           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3114           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3115               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3116         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3117     }
3118
3119   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3120      assuming that the loop will be used as a main loop.  We will redo
3121      this analysis later if we instead decide to use the loop as an
3122      epilogue loop.  */
3123   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3124   if (!ok)
3125     return ok;
3126
3127   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3128      to be able to handle fewer than VF scalars, or needs to have a lower VF
3129      than the main loop.  */
3130   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3131       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3132     {
3133       poly_uint64 unscaled_vf
3134         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3135                      orig_loop_vinfo->suggested_unroll_factor);
3136       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3137         return opt_result::failure_at (vect_location,
3138                                        "Vectorization factor too high for"
3139                                        " epilogue loop.\n");
3140     }
3141
3142   /* Check the costings of the loop make vectorizing worthwhile.  */
3143   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3144   if (res < 0)
3145     {
3146       ok = opt_result::failure_at (vect_location,
3147                                    "Loop costings may not be worthwhile.\n");
3148       goto again;
3149     }
3150   if (!res)
3151     return opt_result::failure_at (vect_location,
3152                                    "Loop costings not worthwhile.\n");
3153
3154   /* If an epilogue loop is required make sure we can create one.  */
3155   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3156       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3157     {
3158       if (dump_enabled_p ())
3159         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3160       if (!vect_can_advance_ivs_p (loop_vinfo)
3161           || !slpeel_can_duplicate_loop_p (loop,
3162                                            LOOP_VINFO_IV_EXIT (loop_vinfo),
3163                                            LOOP_VINFO_IV_EXIT (loop_vinfo)))
3164         {
3165           ok = opt_result::failure_at (vect_location,
3166                                        "not vectorized: can't create required "
3167                                        "epilog loop\n");
3168           goto again;
3169         }
3170     }
3171
3172   /* During peeling, we need to check if number of loop iterations is
3173      enough for both peeled prolog loop and vector loop.  This check
3174      can be merged along with threshold check of loop versioning, so
3175      increase threshold for this case if necessary.
3176
3177      If we are analyzing an epilogue we still want to check what its
3178      versioning threshold would be.  If we decide to vectorize the epilogues we
3179      will want to use the lowest versioning threshold of all epilogues and main
3180      loop.  This will enable us to enter a vectorized epilogue even when
3181      versioning the loop.  We can't simply check whether the epilogue requires
3182      versioning though since we may have skipped some versioning checks when
3183      analyzing the epilogue.  For instance, checks for alias versioning will be
3184      skipped when dealing with epilogues as we assume we already checked them
3185      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3186   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3187     {
3188       poly_uint64 niters_th = 0;
3189       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3190
3191       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3192         {
3193           /* Niters for peeled prolog loop.  */
3194           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3195             {
3196               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3197               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3198               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3199             }
3200           else
3201             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3202         }
3203
3204       /* Niters for at least one iteration of vectorized loop.  */
3205       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3206         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3207       /* One additional iteration because of peeling for gap.  */
3208       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3209         niters_th += 1;
3210
3211       /*  Use the same condition as vect_transform_loop to decide when to use
3212           the cost to determine a versioning threshold.  */
3213       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3214           && ordered_p (th, niters_th))
3215         niters_th = ordered_max (poly_uint64 (th), niters_th);
3216
3217       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3218     }
3219
3220   gcc_assert (known_eq (vectorization_factor,
3221                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3222
3223   slp_done_for_suggested_uf = slp;
3224
3225   /* Ok to vectorize!  */
3226   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3227   return opt_result::success ();
3228
3229 again:
3230   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3231   gcc_assert (!ok);
3232
3233   /* Try again with SLP forced off but if we didn't do any SLP there is
3234      no point in re-trying.  */
3235   if (!slp)
3236     return ok;
3237
3238   /* If the slp decision is true when suggested unroll factor is worked
3239      out, and we are applying suggested unroll factor, we don't need to
3240      re-try any more.  */
3241   if (applying_suggested_uf && slp_done_for_suggested_uf)
3242     return ok;
3243
3244   /* If there are reduction chains re-trying will fail anyway.  */
3245   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3246     return ok;
3247
3248   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3249      via interleaving or lane instructions.  */
3250   slp_instance instance;
3251   slp_tree node;
3252   unsigned i, j;
3253   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3254     {
3255       stmt_vec_info vinfo;
3256       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3257       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3258         continue;
3259       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3260       unsigned int size = DR_GROUP_SIZE (vinfo);
3261       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3262       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3263          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3264          && ! vect_grouped_store_supported (vectype, size))
3265         return opt_result::failure_at (vinfo->stmt,
3266                                        "unsupported grouped store\n");
3267       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3268         {
3269           vinfo = SLP_TREE_REPRESENTATIVE (node);
3270           if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3271             {
3272               vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3273               bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3274               size = DR_GROUP_SIZE (vinfo);
3275               vectype = STMT_VINFO_VECTYPE (vinfo);
3276               if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3277                   && ! vect_grouped_load_supported (vectype, single_element_p,
3278                                                     size))
3279                 return opt_result::failure_at (vinfo->stmt,
3280                                                "unsupported grouped load\n");
3281             }
3282         }
3283     }
3284
3285   if (dump_enabled_p ())
3286     dump_printf_loc (MSG_NOTE, vect_location,
3287                      "re-trying with SLP disabled\n");
3288
3289   /* Roll back state appropriately.  No SLP this time.  */
3290   slp = false;
3291   /* Restore vectorization factor as it were without SLP.  */
3292   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3293   /* Free the SLP instances.  */
3294   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3295     vect_free_slp_instance (instance);
3296   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3297   /* Reset SLP type to loop_vect on all stmts.  */
3298   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3299     {
3300       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3301       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3302            !gsi_end_p (si); gsi_next (&si))
3303         {
3304           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3305           STMT_SLP_TYPE (stmt_info) = loop_vect;
3306           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3307               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3308             {
3309               /* vectorizable_reduction adjusts reduction stmt def-types,
3310                  restore them to that of the PHI.  */
3311               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3312                 = STMT_VINFO_DEF_TYPE (stmt_info);
3313               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3314                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3315                 = STMT_VINFO_DEF_TYPE (stmt_info);
3316             }
3317         }
3318       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3319            !gsi_end_p (si); gsi_next (&si))
3320         {
3321           if (is_gimple_debug (gsi_stmt (si)))
3322             continue;
3323           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3324           STMT_SLP_TYPE (stmt_info) = loop_vect;
3325           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3326             {
3327               stmt_vec_info pattern_stmt_info
3328                 = STMT_VINFO_RELATED_STMT (stmt_info);
3329               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3330                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3331
3332               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3333               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3334               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3335                    !gsi_end_p (pi); gsi_next (&pi))
3336                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3337                   = loop_vect;
3338             }
3339         }
3340     }
3341   /* Free optimized alias test DDRS.  */
3342   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3343   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3344   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3345   /* Reset target cost data.  */
3346   delete loop_vinfo->vector_costs;
3347   loop_vinfo->vector_costs = nullptr;
3348   /* Reset accumulated rgroup information.  */
3349   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3350   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3351   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3352   /* Reset assorted flags.  */
3353   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3354   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3355   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3356   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3357   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3358     = saved_can_use_partial_vectors_p;
3359
3360   goto start_over;
3361 }
3362
3363 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3364    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
3365    OLD_LOOP_VINFO is better unless something specifically indicates
3366    otherwise.
3367
3368    Note that this deliberately isn't a partial order.  */
3369
3370 static bool
3371 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3372                           loop_vec_info old_loop_vinfo)
3373 {
3374   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3375   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3376
3377   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3378   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3379
3380   /* Always prefer a VF of loop->simdlen over any other VF.  */
3381   if (loop->simdlen)
3382     {
3383       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3384       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3385       if (new_simdlen_p != old_simdlen_p)
3386         return new_simdlen_p;
3387     }
3388
3389   const auto *old_costs = old_loop_vinfo->vector_costs;
3390   const auto *new_costs = new_loop_vinfo->vector_costs;
3391   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3392     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3393
3394   return new_costs->better_main_loop_than_p (old_costs);
3395 }
3396
3397 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
3398    true if we should.  */
3399
3400 static bool
3401 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3402                         loop_vec_info old_loop_vinfo)
3403 {
3404   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3405     return false;
3406
3407   if (dump_enabled_p ())
3408     dump_printf_loc (MSG_NOTE, vect_location,
3409                      "***** Preferring vector mode %s to vector mode %s\n",
3410                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3411                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3412   return true;
3413 }
3414
3415 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3416    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3417    MODE_I to the next mode useful to analyze.
3418    Return the loop_vinfo on success and wrapped null on failure.  */
3419
3420 static opt_loop_vec_info
3421 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3422                      const vect_loop_form_info *loop_form_info,
3423                      loop_vec_info main_loop_vinfo,
3424                      const vector_modes &vector_modes, unsigned &mode_i,
3425                      machine_mode &autodetected_vector_mode,
3426                      bool &fatal)
3427 {
3428   loop_vec_info loop_vinfo
3429     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3430
3431   machine_mode vector_mode = vector_modes[mode_i];
3432   loop_vinfo->vector_mode = vector_mode;
3433   unsigned int suggested_unroll_factor = 1;
3434   bool slp_done_for_suggested_uf = false;
3435
3436   /* Run the main analysis.  */
3437   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3438                                         &suggested_unroll_factor,
3439                                         slp_done_for_suggested_uf);
3440   if (dump_enabled_p ())
3441     dump_printf_loc (MSG_NOTE, vect_location,
3442                      "***** Analysis %s with vector mode %s\n",
3443                      res ? "succeeded" : " failed",
3444                      GET_MODE_NAME (loop_vinfo->vector_mode));
3445
3446   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3447     {
3448       if (dump_enabled_p ())
3449         dump_printf_loc (MSG_NOTE, vect_location,
3450                          "***** Re-trying analysis for unrolling"
3451                          " with unroll factor %d and slp %s.\n",
3452                          suggested_unroll_factor,
3453                          slp_done_for_suggested_uf ? "on" : "off");
3454       loop_vec_info unroll_vinfo
3455         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3456       unroll_vinfo->vector_mode = vector_mode;
3457       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3458       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3459                                                 slp_done_for_suggested_uf);
3460       if (new_res)
3461         {
3462           delete loop_vinfo;
3463           loop_vinfo = unroll_vinfo;
3464         }
3465       else
3466         delete unroll_vinfo;
3467     }
3468
3469   /* Remember the autodetected vector mode.  */
3470   if (vector_mode == VOIDmode)
3471     autodetected_vector_mode = loop_vinfo->vector_mode;
3472
3473   /* Advance mode_i, first skipping modes that would result in the
3474      same analysis result.  */
3475   while (mode_i + 1 < vector_modes.length ()
3476          && vect_chooses_same_modes_p (loop_vinfo,
3477                                        vector_modes[mode_i + 1]))
3478     {
3479       if (dump_enabled_p ())
3480         dump_printf_loc (MSG_NOTE, vect_location,
3481                          "***** The result for vector mode %s would"
3482                          " be the same\n",
3483                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3484       mode_i += 1;
3485     }
3486   if (mode_i + 1 < vector_modes.length ()
3487       && VECTOR_MODE_P (autodetected_vector_mode)
3488       && (related_vector_mode (vector_modes[mode_i + 1],
3489                                GET_MODE_INNER (autodetected_vector_mode))
3490           == autodetected_vector_mode)
3491       && (related_vector_mode (autodetected_vector_mode,
3492                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3493           == vector_modes[mode_i + 1]))
3494     {
3495       if (dump_enabled_p ())
3496         dump_printf_loc (MSG_NOTE, vect_location,
3497                          "***** Skipping vector mode %s, which would"
3498                          " repeat the analysis for %s\n",
3499                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3500                          GET_MODE_NAME (autodetected_vector_mode));
3501       mode_i += 1;
3502     }
3503   mode_i++;
3504
3505   if (!res)
3506     {
3507       delete loop_vinfo;
3508       if (fatal)
3509         gcc_checking_assert (main_loop_vinfo == NULL);
3510       return opt_loop_vec_info::propagate_failure (res);
3511     }
3512
3513   return opt_loop_vec_info::success (loop_vinfo);
3514 }
3515
3516 /* Function vect_analyze_loop.
3517
3518    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3519    for it.  The different analyses will record information in the
3520    loop_vec_info struct.  */
3521 opt_loop_vec_info
3522 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3523 {
3524   DUMP_VECT_SCOPE ("analyze_loop_nest");
3525
3526   if (loop_outer (loop)
3527       && loop_vec_info_for_loop (loop_outer (loop))
3528       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3529     return opt_loop_vec_info::failure_at (vect_location,
3530                                           "outer-loop already vectorized.\n");
3531
3532   if (!find_loop_nest (loop, &shared->loop_nest))
3533     return opt_loop_vec_info::failure_at
3534       (vect_location,
3535        "not vectorized: loop nest containing two or more consecutive inner"
3536        " loops cannot be vectorized\n");
3537
3538   /* Analyze the loop form.  */
3539   vect_loop_form_info loop_form_info;
3540   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3541   if (!res)
3542     {
3543       if (dump_enabled_p ())
3544         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3545                          "bad loop form.\n");
3546       return opt_loop_vec_info::propagate_failure (res);
3547     }
3548   if (!integer_onep (loop_form_info.assumptions))
3549     {
3550       /* We consider to vectorize this loop by versioning it under
3551          some assumptions.  In order to do this, we need to clear
3552          existing information computed by scev and niter analyzer.  */
3553       scev_reset_htab ();
3554       free_numbers_of_iterations_estimates (loop);
3555       /* Also set flag for this loop so that following scev and niter
3556          analysis are done under the assumptions.  */
3557       loop_constraint_set (loop, LOOP_C_FINITE);
3558     }
3559
3560   auto_vector_modes vector_modes;
3561   /* Autodetect first vector size we try.  */
3562   vector_modes.safe_push (VOIDmode);
3563   unsigned int autovec_flags
3564     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3565                                                     loop->simdlen != 0);
3566   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3567                              && !unlimited_cost_model (loop));
3568   machine_mode autodetected_vector_mode = VOIDmode;
3569   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3570   unsigned int mode_i = 0;
3571   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3572
3573   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3574      a mode has not been analyzed.  */
3575   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3576   for (unsigned i = 0; i < vector_modes.length (); ++i)
3577     cached_vf_per_mode.safe_push (0);
3578
3579   /* First determine the main loop vectorization mode, either the first
3580      one that works, starting with auto-detecting the vector mode and then
3581      following the targets order of preference, or the one with the
3582      lowest cost if pick_lowest_cost_p.  */
3583   while (1)
3584     {
3585       bool fatal;
3586       unsigned int last_mode_i = mode_i;
3587       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3588          failed.  */
3589       cached_vf_per_mode[last_mode_i] = -1;
3590       opt_loop_vec_info loop_vinfo
3591         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3592                                NULL, vector_modes, mode_i,
3593                                autodetected_vector_mode, fatal);
3594       if (fatal)
3595         break;
3596
3597       if (loop_vinfo)
3598         {
3599           /*  Analyzis has been successful so update the VF value.  The
3600               VF should always be a multiple of unroll_factor and we want to
3601               capture the original VF here.  */
3602           cached_vf_per_mode[last_mode_i]
3603             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3604                          loop_vinfo->suggested_unroll_factor);
3605           /* Once we hit the desired simdlen for the first time,
3606              discard any previous attempts.  */
3607           if (simdlen
3608               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3609             {
3610               delete first_loop_vinfo;
3611               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3612               simdlen = 0;
3613             }
3614           else if (pick_lowest_cost_p
3615                    && first_loop_vinfo
3616                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3617             {
3618               /* Pick loop_vinfo over first_loop_vinfo.  */
3619               delete first_loop_vinfo;
3620               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3621             }
3622           if (first_loop_vinfo == NULL)
3623             first_loop_vinfo = loop_vinfo;
3624           else
3625             {
3626               delete loop_vinfo;
3627               loop_vinfo = opt_loop_vec_info::success (NULL);
3628             }
3629
3630           /* Commit to first_loop_vinfo if we have no reason to try
3631              alternatives.  */
3632           if (!simdlen && !pick_lowest_cost_p)
3633             break;
3634         }
3635       if (mode_i == vector_modes.length ()
3636           || autodetected_vector_mode == VOIDmode)
3637         break;
3638
3639       /* Try the next biggest vector size.  */
3640       if (dump_enabled_p ())
3641         dump_printf_loc (MSG_NOTE, vect_location,
3642                          "***** Re-trying analysis with vector mode %s\n",
3643                          GET_MODE_NAME (vector_modes[mode_i]));
3644     }
3645   if (!first_loop_vinfo)
3646     return opt_loop_vec_info::propagate_failure (res);
3647
3648   if (dump_enabled_p ())
3649     dump_printf_loc (MSG_NOTE, vect_location,
3650                      "***** Choosing vector mode %s\n",
3651                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3652
3653   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3654      enabled, SIMDUID is not set, it is the innermost loop and we have
3655      either already found the loop's SIMDLEN or there was no SIMDLEN to
3656      begin with.
3657      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3658   bool vect_epilogues = (!simdlen
3659                          && loop->inner == NULL
3660                          && param_vect_epilogues_nomask
3661                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3662                          && !loop->simduid);
3663   if (!vect_epilogues)
3664     return first_loop_vinfo;
3665
3666   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3667   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3668
3669   /* For epilogues start the analysis from the first mode.  The motivation
3670      behind starting from the beginning comes from cases where the VECTOR_MODES
3671      array may contain length-agnostic and length-specific modes.  Their
3672      ordering is not guaranteed, so we could end up picking a mode for the main
3673      loop that is after the epilogue's optimal mode.  */
3674   vector_modes[0] = autodetected_vector_mode;
3675   mode_i = 0;
3676
3677   bool supports_partial_vectors =
3678     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3679   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3680
3681   while (1)
3682     {
3683       /* If the target does not support partial vectors we can shorten the
3684          number of modes to analyze for the epilogue as we know we can't pick a
3685          mode that would lead to a VF at least as big as the
3686          FIRST_VINFO_VF.  */
3687       if (!supports_partial_vectors
3688           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3689         {
3690           mode_i++;
3691           if (mode_i == vector_modes.length ())
3692             break;
3693           continue;
3694         }
3695
3696       if (dump_enabled_p ())
3697         dump_printf_loc (MSG_NOTE, vect_location,
3698                          "***** Re-trying epilogue analysis with vector "
3699                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3700
3701       bool fatal;
3702       opt_loop_vec_info loop_vinfo
3703         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3704                                first_loop_vinfo,
3705                                vector_modes, mode_i,
3706                                autodetected_vector_mode, fatal);
3707       if (fatal)
3708         break;
3709
3710       if (loop_vinfo)
3711         {
3712           if (pick_lowest_cost_p)
3713             {
3714               /* Keep trying to roll back vectorization attempts while the
3715                  loop_vec_infos they produced were worse than this one.  */
3716               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3717               while (!vinfos.is_empty ()
3718                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3719                 {
3720                   gcc_assert (vect_epilogues);
3721                   delete vinfos.pop ();
3722                 }
3723             }
3724           /* For now only allow one epilogue loop.  */
3725           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3726             {
3727               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3728               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3729               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3730                           || maybe_ne (lowest_th, 0U));
3731               /* Keep track of the known smallest versioning
3732                  threshold.  */
3733               if (ordered_p (lowest_th, th))
3734                 lowest_th = ordered_min (lowest_th, th);
3735             }
3736           else
3737             {
3738               delete loop_vinfo;
3739               loop_vinfo = opt_loop_vec_info::success (NULL);
3740             }
3741
3742           /* For now only allow one epilogue loop, but allow
3743              pick_lowest_cost_p to replace it, so commit to the
3744              first epilogue if we have no reason to try alternatives.  */
3745           if (!pick_lowest_cost_p)
3746             break;
3747         }
3748
3749       if (mode_i == vector_modes.length ())
3750         break;
3751
3752     }
3753
3754   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3755     {
3756       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3757       if (dump_enabled_p ())
3758         dump_printf_loc (MSG_NOTE, vect_location,
3759                          "***** Choosing epilogue vector mode %s\n",
3760                          GET_MODE_NAME
3761                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3762     }
3763
3764   return first_loop_vinfo;
3765 }
3766
3767 /* Return true if there is an in-order reduction function for CODE, storing
3768    it in *REDUC_FN if so.  */
3769
3770 static bool
3771 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3772 {
3773   /* We support MINUS_EXPR by negating the operand.  This also preserves an
3774      initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3775      (-0.0) = -0.0.  */
3776   if (code == PLUS_EXPR || code == MINUS_EXPR)
3777     {
3778       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3779       return true;
3780     }
3781   return false;
3782 }
3783
3784 /* Function reduction_fn_for_scalar_code
3785
3786    Input:
3787    CODE - tree_code of a reduction operations.
3788
3789    Output:
3790    REDUC_FN - the corresponding internal function to be used to reduce the
3791       vector of partial results into a single scalar result, or IFN_LAST
3792       if the operation is a supported reduction operation, but does not have
3793       such an internal function.
3794
3795    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3796
3797 bool
3798 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3799 {
3800   if (code.is_tree_code ())
3801     switch (tree_code (code))
3802       {
3803       case MAX_EXPR:
3804         *reduc_fn = IFN_REDUC_MAX;
3805         return true;
3806
3807       case MIN_EXPR:
3808         *reduc_fn = IFN_REDUC_MIN;
3809         return true;
3810
3811       case PLUS_EXPR:
3812         *reduc_fn = IFN_REDUC_PLUS;
3813         return true;
3814
3815       case BIT_AND_EXPR:
3816         *reduc_fn = IFN_REDUC_AND;
3817         return true;
3818
3819       case BIT_IOR_EXPR:
3820         *reduc_fn = IFN_REDUC_IOR;
3821         return true;
3822
3823       case BIT_XOR_EXPR:
3824         *reduc_fn = IFN_REDUC_XOR;
3825         return true;
3826
3827       case MULT_EXPR:
3828       case MINUS_EXPR:
3829         *reduc_fn = IFN_LAST;
3830         return true;
3831
3832       default:
3833         return false;
3834       }
3835   else
3836     switch (combined_fn (code))
3837       {
3838       CASE_CFN_FMAX:
3839         *reduc_fn = IFN_REDUC_FMAX;
3840         return true;
3841
3842       CASE_CFN_FMIN:
3843         *reduc_fn = IFN_REDUC_FMIN;
3844         return true;
3845
3846       default:
3847         return false;
3848       }
3849 }
3850
3851 /* If there is a neutral value X such that a reduction would not be affected
3852    by the introduction of additional X elements, return that X, otherwise
3853    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3854    of the scalar elements.  If the reduction has just a single initial value
3855    then INITIAL_VALUE is that value, otherwise it is null.
3856    If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3857    In that case no signed zero is returned.  */
3858
3859 tree
3860 neutral_op_for_reduction (tree scalar_type, code_helper code,
3861                           tree initial_value, bool as_initial)
3862 {
3863   if (code.is_tree_code ())
3864     switch (tree_code (code))
3865       {
3866       case DOT_PROD_EXPR:
3867       case SAD_EXPR:
3868       case MINUS_EXPR:
3869       case BIT_IOR_EXPR:
3870       case BIT_XOR_EXPR:
3871         return build_zero_cst (scalar_type);
3872       case WIDEN_SUM_EXPR:
3873       case PLUS_EXPR:
3874         if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3875           return build_real (scalar_type, dconstm0);
3876         else
3877           return build_zero_cst (scalar_type);
3878
3879       case MULT_EXPR:
3880         return build_one_cst (scalar_type);
3881
3882       case BIT_AND_EXPR:
3883         return build_all_ones_cst (scalar_type);
3884
3885       case MAX_EXPR:
3886       case MIN_EXPR:
3887         return initial_value;
3888
3889       default:
3890         return NULL_TREE;
3891       }
3892   else
3893     switch (combined_fn (code))
3894       {
3895       CASE_CFN_FMIN:
3896       CASE_CFN_FMAX:
3897         return initial_value;
3898
3899       default:
3900         return NULL_TREE;
3901       }
3902 }
3903
3904 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3905    STMT is printed with a message MSG. */
3906
3907 static void
3908 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3909 {
3910   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3911 }
3912
3913 /* Return true if we need an in-order reduction for operation CODE
3914    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3915    overflow must wrap.  */
3916
3917 bool
3918 needs_fold_left_reduction_p (tree type, code_helper code)
3919 {
3920   /* CHECKME: check for !flag_finite_math_only too?  */
3921   if (SCALAR_FLOAT_TYPE_P (type))
3922     {
3923       if (code.is_tree_code ())
3924         switch (tree_code (code))
3925           {
3926           case MIN_EXPR:
3927           case MAX_EXPR:
3928             return false;
3929
3930           default:
3931             return !flag_associative_math;
3932           }
3933       else
3934         switch (combined_fn (code))
3935           {
3936           CASE_CFN_FMIN:
3937           CASE_CFN_FMAX:
3938             return false;
3939
3940           default:
3941             return !flag_associative_math;
3942           }
3943     }
3944
3945   if (INTEGRAL_TYPE_P (type))
3946     return (!code.is_tree_code ()
3947             || !operation_no_trapping_overflow (type, tree_code (code)));
3948
3949   if (SAT_FIXED_POINT_TYPE_P (type))
3950     return true;
3951
3952   return false;
3953 }
3954
3955 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3956    has a handled computation expression.  Store the main reduction
3957    operation in *CODE.  */
3958
3959 static bool
3960 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3961                       tree loop_arg, code_helper *code,
3962                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3963 {
3964   auto_bitmap visited;
3965   tree lookfor = PHI_RESULT (phi);
3966   ssa_op_iter curri;
3967   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3968   while (USE_FROM_PTR (curr) != loop_arg)
3969     curr = op_iter_next_use (&curri);
3970   curri.i = curri.numops;
3971   do
3972     {
3973       path.safe_push (std::make_pair (curri, curr));
3974       tree use = USE_FROM_PTR (curr);
3975       if (use == lookfor)
3976         break;
3977       gimple *def = SSA_NAME_DEF_STMT (use);
3978       if (gimple_nop_p (def)
3979           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3980         {
3981 pop:
3982           do
3983             {
3984               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3985               curri = x.first;
3986               curr = x.second;
3987               do
3988                 curr = op_iter_next_use (&curri);
3989               /* Skip already visited or non-SSA operands (from iterating
3990                  over PHI args).  */
3991               while (curr != NULL_USE_OPERAND_P
3992                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3993                          || ! bitmap_set_bit (visited,
3994                                               SSA_NAME_VERSION
3995                                                 (USE_FROM_PTR (curr)))));
3996             }
3997           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3998           if (curr == NULL_USE_OPERAND_P)
3999             break;
4000         }
4001       else
4002         {
4003           if (gimple_code (def) == GIMPLE_PHI)
4004             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4005           else
4006             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4007           while (curr != NULL_USE_OPERAND_P
4008                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4009                      || ! bitmap_set_bit (visited,
4010                                           SSA_NAME_VERSION
4011                                             (USE_FROM_PTR (curr)))))
4012             curr = op_iter_next_use (&curri);
4013           if (curr == NULL_USE_OPERAND_P)
4014             goto pop;
4015         }
4016     }
4017   while (1);
4018   if (dump_file && (dump_flags & TDF_DETAILS))
4019     {
4020       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4021       unsigned i;
4022       std::pair<ssa_op_iter, use_operand_p> *x;
4023       FOR_EACH_VEC_ELT (path, i, x)
4024         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4025       dump_printf (MSG_NOTE, "\n");
4026     }
4027
4028   /* Check whether the reduction path detected is valid.  */
4029   bool fail = path.length () == 0;
4030   bool neg = false;
4031   int sign = -1;
4032   *code = ERROR_MARK;
4033   for (unsigned i = 1; i < path.length (); ++i)
4034     {
4035       gimple *use_stmt = USE_STMT (path[i].second);
4036       gimple_match_op op;
4037       if (!gimple_extract_op (use_stmt, &op))
4038         {
4039           fail = true;
4040           break;
4041         }
4042       unsigned int opi = op.num_ops;
4043       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4044         {
4045           /* The following make sure we can compute the operand index
4046              easily plus it mostly disallows chaining via COND_EXPR condition
4047              operands.  */
4048           for (opi = 0; opi < op.num_ops; ++opi)
4049             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4050               break;
4051         }
4052       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4053         {
4054           for (opi = 0; opi < op.num_ops; ++opi)
4055             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4056               break;
4057         }
4058       if (opi == op.num_ops)
4059         {
4060           fail = true;
4061           break;
4062         }
4063       op.code = canonicalize_code (op.code, op.type);
4064       if (op.code == MINUS_EXPR)
4065         {
4066           op.code = PLUS_EXPR;
4067           /* Track whether we negate the reduction value each iteration.  */
4068           if (op.ops[1] == op.ops[opi])
4069             neg = ! neg;
4070         }
4071       if (CONVERT_EXPR_CODE_P (op.code)
4072           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4073         ;
4074       else if (*code == ERROR_MARK)
4075         {
4076           *code = op.code;
4077           sign = TYPE_SIGN (op.type);
4078         }
4079       else if (op.code != *code)
4080         {
4081           fail = true;
4082           break;
4083         }
4084       else if ((op.code == MIN_EXPR
4085                 || op.code == MAX_EXPR)
4086                && sign != TYPE_SIGN (op.type))
4087         {
4088           fail = true;
4089           break;
4090         }
4091       /* Check there's only a single stmt the op is used on.  For the
4092          not value-changing tail and the last stmt allow out-of-loop uses.
4093          ???  We could relax this and handle arbitrary live stmts by
4094          forcing a scalar epilogue for example.  */
4095       imm_use_iterator imm_iter;
4096       use_operand_p use_p;
4097       gimple *op_use_stmt;
4098       unsigned cnt = 0;
4099       bool cond_fn_p = op.code.is_internal_fn ()
4100         && (conditional_internal_fn_code (internal_fn (op.code))
4101             != ERROR_MARK);
4102
4103       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4104         {
4105         /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4106            op1 twice (once as definition, once as else) in the same operation.
4107            Allow this.  */
4108           if (cond_fn_p)
4109             {
4110               gcall *call = dyn_cast<gcall *> (use_stmt);
4111               unsigned else_pos
4112                 = internal_fn_else_index (internal_fn (op.code));
4113
4114               for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4115                 {
4116                   if (j == else_pos)
4117                     continue;
4118                   if (gimple_call_arg (call, j) == op.ops[opi])
4119                     cnt++;
4120                 }
4121             }
4122           else if (!is_gimple_debug (op_use_stmt)
4123                    && (*code != ERROR_MARK
4124                        || flow_bb_inside_loop_p (loop,
4125                                                  gimple_bb (op_use_stmt))))
4126             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4127               cnt++;
4128         }
4129
4130       if (cnt != 1)
4131         {
4132           fail = true;
4133           break;
4134         }
4135     }
4136   return ! fail && ! neg && *code != ERROR_MARK;
4137 }
4138
4139 bool
4140 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4141                       tree loop_arg, enum tree_code code)
4142 {
4143   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4144   code_helper code_;
4145   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4146           && code_ == code);
4147 }
4148
4149
4150
4151 /* Function vect_is_simple_reduction
4152
4153    (1) Detect a cross-iteration def-use cycle that represents a simple
4154    reduction computation.  We look for the following pattern:
4155
4156    loop_header:
4157      a1 = phi < a0, a2 >
4158      a3 = ...
4159      a2 = operation (a3, a1)
4160
4161    or
4162
4163    a3 = ...
4164    loop_header:
4165      a1 = phi < a0, a2 >
4166      a2 = operation (a3, a1)
4167
4168    such that:
4169    1. operation is commutative and associative and it is safe to
4170       change the order of the computation
4171    2. no uses for a2 in the loop (a2 is used out of the loop)
4172    3. no uses of a1 in the loop besides the reduction operation
4173    4. no uses of a1 outside the loop.
4174
4175    Conditions 1,4 are tested here.
4176    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4177
4178    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4179    nested cycles.
4180
4181    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4182    reductions:
4183
4184      a1 = phi < a0, a2 >
4185      inner loop (def of a3)
4186      a2 = phi < a3 >
4187
4188    (4) Detect condition expressions, ie:
4189      for (int i = 0; i < N; i++)
4190        if (a[i] < val)
4191         ret_val = a[i];
4192
4193 */
4194
4195 static stmt_vec_info
4196 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4197                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4198 {
4199   gphi *phi = as_a <gphi *> (phi_info->stmt);
4200   gimple *phi_use_stmt = NULL;
4201   imm_use_iterator imm_iter;
4202   use_operand_p use_p;
4203
4204   *double_reduc = false;
4205   *reduc_chain_p = false;
4206   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4207
4208   tree phi_name = PHI_RESULT (phi);
4209   /* ???  If there are no uses of the PHI result the inner loop reduction
4210      won't be detected as possibly double-reduction by vectorizable_reduction
4211      because that tries to walk the PHI arg from the preheader edge which
4212      can be constant.  See PR60382.  */
4213   if (has_zero_uses (phi_name))
4214     return NULL;
4215   class loop *loop = (gimple_bb (phi))->loop_father;
4216   unsigned nphi_def_loop_uses = 0;
4217   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4218     {
4219       gimple *use_stmt = USE_STMT (use_p);
4220       if (is_gimple_debug (use_stmt))
4221         continue;
4222
4223       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4224         {
4225           if (dump_enabled_p ())
4226             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4227                              "intermediate value used outside loop.\n");
4228
4229           return NULL;
4230         }
4231
4232       /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4233          op1 twice (once as definition, once as else) in the same operation.
4234          Only count it as one. */
4235       if (use_stmt != phi_use_stmt)
4236         {
4237           nphi_def_loop_uses++;
4238           phi_use_stmt = use_stmt;
4239         }
4240     }
4241
4242   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4243   if (TREE_CODE (latch_def) != SSA_NAME)
4244     {
4245       if (dump_enabled_p ())
4246         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4247                          "reduction: not ssa_name: %T\n", latch_def);
4248       return NULL;
4249     }
4250
4251   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4252   if (!def_stmt_info
4253       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4254     return NULL;
4255
4256   bool nested_in_vect_loop
4257     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4258   unsigned nlatch_def_loop_uses = 0;
4259   auto_vec<gphi *, 3> lcphis;
4260   bool inner_loop_of_double_reduc = false;
4261   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4262     {
4263       gimple *use_stmt = USE_STMT (use_p);
4264       if (is_gimple_debug (use_stmt))
4265         continue;
4266       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4267         nlatch_def_loop_uses++;
4268       else
4269         {
4270           /* We can have more than one loop-closed PHI.  */
4271           lcphis.safe_push (as_a <gphi *> (use_stmt));
4272           if (nested_in_vect_loop
4273               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4274                   == vect_double_reduction_def))
4275             inner_loop_of_double_reduc = true;
4276         }
4277     }
4278
4279   /* If we are vectorizing an inner reduction we are executing that
4280      in the original order only in case we are not dealing with a
4281      double reduction.  */
4282   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4283     {
4284       if (dump_enabled_p ())
4285         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4286                         "detected nested cycle: ");
4287       return def_stmt_info;
4288     }
4289
4290   /* When the inner loop of a double reduction ends up with more than
4291      one loop-closed PHI we have failed to classify alternate such
4292      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4293   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4294     {
4295       if (dump_enabled_p ())
4296         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4297                          "unhandle double reduction\n");
4298       return NULL;
4299     }
4300
4301   /* If this isn't a nested cycle or if the nested cycle reduction value
4302      is used ouside of the inner loop we cannot handle uses of the reduction
4303      value.  */
4304   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4305     {
4306       if (dump_enabled_p ())
4307         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4308                          "reduction used in loop.\n");
4309       return NULL;
4310     }
4311
4312   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4313      defined in the inner loop.  */
4314   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4315     {
4316       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4317       if (gimple_phi_num_args (def_stmt) != 1
4318           || TREE_CODE (op1) != SSA_NAME)
4319         {
4320           if (dump_enabled_p ())
4321             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4322                              "unsupported phi node definition.\n");
4323
4324           return NULL;
4325         }
4326
4327       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4328          and the latch definition op1.  */
4329       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4330       if (gimple_bb (def1)
4331           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4332           && loop->inner
4333           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4334           && (is_gimple_assign (def1) || is_gimple_call (def1))
4335           && is_a <gphi *> (phi_use_stmt)
4336           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4337           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4338                                             loop_latch_edge (loop->inner))))
4339         {
4340           if (dump_enabled_p ())
4341             report_vect_op (MSG_NOTE, def_stmt,
4342                             "detected double reduction: ");
4343
4344           *double_reduc = true;
4345           return def_stmt_info;
4346         }
4347
4348       return NULL;
4349     }
4350
4351   /* Look for the expression computing latch_def from then loop PHI result.  */
4352   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4353   code_helper code;
4354   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4355                             path))
4356     {
4357       STMT_VINFO_REDUC_CODE (phi_info) = code;
4358       if (code == COND_EXPR && !nested_in_vect_loop)
4359         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4360
4361       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4362          reduction chain for which the additional restriction is that
4363          all operations in the chain are the same.  */
4364       auto_vec<stmt_vec_info, 8> reduc_chain;
4365       unsigned i;
4366       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4367       for (i = path.length () - 1; i >= 1; --i)
4368         {
4369           gimple *stmt = USE_STMT (path[i].second);
4370           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4371           gimple_match_op op;
4372           if (!gimple_extract_op (stmt, &op))
4373             gcc_unreachable ();
4374           if (gassign *assign = dyn_cast<gassign *> (stmt))
4375             STMT_VINFO_REDUC_IDX (stmt_info)
4376               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4377           else
4378             {
4379               gcall *call = as_a<gcall *> (stmt);
4380               STMT_VINFO_REDUC_IDX (stmt_info)
4381                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4382             }
4383           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4384                                      && (i == 1 || i == path.length () - 1));
4385           if ((op.code != code && !leading_conversion)
4386               /* We can only handle the final value in epilogue
4387                  generation for reduction chains.  */
4388               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4389             is_slp_reduc = false;
4390           /* For reduction chains we support a trailing/leading
4391              conversions.  We do not store those in the actual chain.  */
4392           if (leading_conversion)
4393             continue;
4394           reduc_chain.safe_push (stmt_info);
4395         }
4396       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4397         {
4398           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4399             {
4400               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4401               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4402             }
4403           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4404           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4405
4406           /* Save the chain for further analysis in SLP detection.  */
4407           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4408           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4409
4410           *reduc_chain_p = true;
4411           if (dump_enabled_p ())
4412             dump_printf_loc (MSG_NOTE, vect_location,
4413                             "reduction: detected reduction chain\n");
4414         }
4415       else if (dump_enabled_p ())
4416         dump_printf_loc (MSG_NOTE, vect_location,
4417                          "reduction: detected reduction\n");
4418
4419       return def_stmt_info;
4420     }
4421
4422   if (dump_enabled_p ())
4423     dump_printf_loc (MSG_NOTE, vect_location,
4424                      "reduction: unknown pattern\n");
4425
4426   return NULL;
4427 }
4428
4429 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4430    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4431    or -1 if not known.  */
4432
4433 static int
4434 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4435 {
4436   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4437   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4438     {
4439       if (dump_enabled_p ())
4440         dump_printf_loc (MSG_NOTE, vect_location,
4441                          "cost model: epilogue peel iters set to vf/2 "
4442                          "because loop iterations are unknown .\n");
4443       return assumed_vf / 2;
4444     }
4445   else
4446     {
4447       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4448       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4449       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4450       /* If we need to peel for gaps, but no peeling is required, we have to
4451          peel VF iterations.  */
4452       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4453         peel_iters_epilogue = assumed_vf;
4454       return peel_iters_epilogue;
4455     }
4456 }
4457
4458 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
4459 int
4460 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4461                              int *peel_iters_epilogue,
4462                              stmt_vector_for_cost *scalar_cost_vec,
4463                              stmt_vector_for_cost *prologue_cost_vec,
4464                              stmt_vector_for_cost *epilogue_cost_vec)
4465 {
4466   int retval = 0;
4467
4468   *peel_iters_epilogue
4469     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4470
4471   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4472     {
4473       /* If peeled iterations are known but number of scalar loop
4474          iterations are unknown, count a taken branch per peeled loop.  */
4475       if (peel_iters_prologue > 0)
4476         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4477                                    vect_prologue);
4478       if (*peel_iters_epilogue > 0)
4479         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4480                                     vect_epilogue);
4481     }
4482
4483   stmt_info_for_cost *si;
4484   int j;
4485   if (peel_iters_prologue)
4486     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4487       retval += record_stmt_cost (prologue_cost_vec,
4488                                   si->count * peel_iters_prologue,
4489                                   si->kind, si->stmt_info, si->misalign,
4490                                   vect_prologue);
4491   if (*peel_iters_epilogue)
4492     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4493       retval += record_stmt_cost (epilogue_cost_vec,
4494                                   si->count * *peel_iters_epilogue,
4495                                   si->kind, si->stmt_info, si->misalign,
4496                                   vect_epilogue);
4497
4498   return retval;
4499 }
4500
4501 /* Function vect_estimate_min_profitable_iters
4502
4503    Return the number of iterations required for the vector version of the
4504    loop to be profitable relative to the cost of the scalar version of the
4505    loop.
4506
4507    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4508    of iterations for vectorization.  -1 value means loop vectorization
4509    is not profitable.  This returned value may be used for dynamic
4510    profitability check.
4511
4512    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4513    for static check against estimated number of iterations.  */
4514
4515 static void
4516 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4517                                     int *ret_min_profitable_niters,
4518                                     int *ret_min_profitable_estimate,
4519                                     unsigned *suggested_unroll_factor)
4520 {
4521   int min_profitable_iters;
4522   int min_profitable_estimate;
4523   int peel_iters_prologue;
4524   int peel_iters_epilogue;
4525   unsigned vec_inside_cost = 0;
4526   int vec_outside_cost = 0;
4527   unsigned vec_prologue_cost = 0;
4528   unsigned vec_epilogue_cost = 0;
4529   int scalar_single_iter_cost = 0;
4530   int scalar_outside_cost = 0;
4531   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4532   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4533   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4534
4535   /* Cost model disabled.  */
4536   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4537     {
4538       if (dump_enabled_p ())
4539         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4540       *ret_min_profitable_niters = 0;
4541       *ret_min_profitable_estimate = 0;
4542       return;
4543     }
4544
4545   /* Requires loop versioning tests to handle misalignment.  */
4546   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4547     {
4548       /*  FIXME: Make cost depend on complexity of individual check.  */
4549       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4550       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4551       if (dump_enabled_p ())
4552         dump_printf (MSG_NOTE,
4553                      "cost model: Adding cost of checks for loop "
4554                      "versioning to treat misalignment.\n");
4555     }
4556
4557   /* Requires loop versioning with alias checks.  */
4558   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4559     {
4560       /*  FIXME: Make cost depend on complexity of individual check.  */
4561       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4562       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4563       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4564       if (len)
4565         /* Count LEN - 1 ANDs and LEN comparisons.  */
4566         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4567                               scalar_stmt, vect_prologue);
4568       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4569       if (len)
4570         {
4571           /* Count LEN - 1 ANDs and LEN comparisons.  */
4572           unsigned int nstmts = len * 2 - 1;
4573           /* +1 for each bias that needs adding.  */
4574           for (unsigned int i = 0; i < len; ++i)
4575             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4576               nstmts += 1;
4577           (void) add_stmt_cost (target_cost_data, nstmts,
4578                                 scalar_stmt, vect_prologue);
4579         }
4580       if (dump_enabled_p ())
4581         dump_printf (MSG_NOTE,
4582                      "cost model: Adding cost of checks for loop "
4583                      "versioning aliasing.\n");
4584     }
4585
4586   /* Requires loop versioning with niter checks.  */
4587   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4588     {
4589       /*  FIXME: Make cost depend on complexity of individual check.  */
4590       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4591                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4592       if (dump_enabled_p ())
4593         dump_printf (MSG_NOTE,
4594                      "cost model: Adding cost of checks for loop "
4595                      "versioning niters.\n");
4596     }
4597
4598   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4599     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4600                           vect_prologue);
4601
4602   /* Count statements in scalar loop.  Using this as scalar cost for a single
4603      iteration for now.
4604
4605      TODO: Add outer loop support.
4606
4607      TODO: Consider assigning different costs to different scalar
4608      statements.  */
4609
4610   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4611
4612   /* Add additional cost for the peeled instructions in prologue and epilogue
4613      loop.  (For fully-masked loops there will be no peeling.)
4614
4615      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4616      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4617
4618      TODO: Build an expression that represents peel_iters for prologue and
4619      epilogue to be used in a run-time test.  */
4620
4621   bool prologue_need_br_taken_cost = false;
4622   bool prologue_need_br_not_taken_cost = false;
4623
4624   /* Calculate peel_iters_prologue.  */
4625   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4626     peel_iters_prologue = 0;
4627   else if (npeel < 0)
4628     {
4629       peel_iters_prologue = assumed_vf / 2;
4630       if (dump_enabled_p ())
4631         dump_printf (MSG_NOTE, "cost model: "
4632                      "prologue peel iters set to vf/2.\n");
4633
4634       /* If peeled iterations are unknown, count a taken branch and a not taken
4635          branch per peeled loop.  Even if scalar loop iterations are known,
4636          vector iterations are not known since peeled prologue iterations are
4637          not known.  Hence guards remain the same.  */
4638       prologue_need_br_taken_cost = true;
4639       prologue_need_br_not_taken_cost = true;
4640     }
4641   else
4642     {
4643       peel_iters_prologue = npeel;
4644       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4645         /* If peeled iterations are known but number of scalar loop
4646            iterations are unknown, count a taken branch per peeled loop.  */
4647         prologue_need_br_taken_cost = true;
4648     }
4649
4650   bool epilogue_need_br_taken_cost = false;
4651   bool epilogue_need_br_not_taken_cost = false;
4652
4653   /* Calculate peel_iters_epilogue.  */
4654   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4655     /* We need to peel exactly one iteration for gaps.  */
4656     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4657   else if (npeel < 0)
4658     {
4659       /* If peeling for alignment is unknown, loop bound of main loop
4660          becomes unknown.  */
4661       peel_iters_epilogue = assumed_vf / 2;
4662       if (dump_enabled_p ())
4663         dump_printf (MSG_NOTE, "cost model: "
4664                      "epilogue peel iters set to vf/2 because "
4665                      "peeling for alignment is unknown.\n");
4666
4667       /* See the same reason above in peel_iters_prologue calculation.  */
4668       epilogue_need_br_taken_cost = true;
4669       epilogue_need_br_not_taken_cost = true;
4670     }
4671   else
4672     {
4673       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4674       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4675         /* If peeled iterations are known but number of scalar loop
4676            iterations are unknown, count a taken branch per peeled loop.  */
4677         epilogue_need_br_taken_cost = true;
4678     }
4679
4680   stmt_info_for_cost *si;
4681   int j;
4682   /* Add costs associated with peel_iters_prologue.  */
4683   if (peel_iters_prologue)
4684     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4685       {
4686         (void) add_stmt_cost (target_cost_data,
4687                               si->count * peel_iters_prologue, si->kind,
4688                               si->stmt_info, si->node, si->vectype,
4689                               si->misalign, vect_prologue);
4690       }
4691
4692   /* Add costs associated with peel_iters_epilogue.  */
4693   if (peel_iters_epilogue)
4694     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4695       {
4696         (void) add_stmt_cost (target_cost_data,
4697                               si->count * peel_iters_epilogue, si->kind,
4698                               si->stmt_info, si->node, si->vectype,
4699                               si->misalign, vect_epilogue);
4700       }
4701
4702   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4703
4704   if (prologue_need_br_taken_cost)
4705     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4706                           vect_prologue);
4707
4708   if (prologue_need_br_not_taken_cost)
4709     (void) add_stmt_cost (target_cost_data, 1,
4710                           cond_branch_not_taken, vect_prologue);
4711
4712   if (epilogue_need_br_taken_cost)
4713     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4714                           vect_epilogue);
4715
4716   if (epilogue_need_br_not_taken_cost)
4717     (void) add_stmt_cost (target_cost_data, 1,
4718                           cond_branch_not_taken, vect_epilogue);
4719
4720   /* Take care of special costs for rgroup controls of partial vectors.  */
4721   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4722       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4723           == vect_partial_vectors_avx512))
4724     {
4725       /* Calculate how many masks we need to generate.  */
4726       unsigned int num_masks = 0;
4727       bool need_saturation = false;
4728       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4729         if (rgm.type)
4730           {
4731             unsigned nvectors = rgm.factor;
4732             num_masks += nvectors;
4733             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4734                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4735               need_saturation = true;
4736           }
4737
4738       /* ???  The target isn't able to identify the costs below as
4739          producing masks so it cannot penaltize cases where we'd run
4740          out of mask registers for example.  */
4741
4742       /* ???  We are also failing to account for smaller vector masks
4743          we generate by splitting larger masks in vect_get_loop_mask.  */
4744
4745       /* In the worst case, we need to generate each mask in the prologue
4746          and in the loop body.  We need one splat per group and one
4747          compare per mask.
4748
4749          Sometimes the prologue mask will fold to a constant,
4750          so the actual prologue cost might be smaller.  However, it's
4751          simpler and safer to use the worst-case cost; if this ends up
4752          being the tie-breaker between vectorizing or not, then it's
4753          probably better not to vectorize.  */
4754       (void) add_stmt_cost (target_cost_data,
4755                             num_masks
4756                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4757                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4758                             vect_prologue);
4759       (void) add_stmt_cost (target_cost_data,
4760                             num_masks
4761                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4762                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4763
4764       /* When we need saturation we need it both in the prologue and
4765          the epilogue.  */
4766       if (need_saturation)
4767         {
4768           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4769                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4770           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4771                                 NULL, NULL, NULL_TREE, 0, vect_body);
4772         }
4773     }
4774   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4775            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4776                == vect_partial_vectors_while_ult))
4777     {
4778       /* Calculate how many masks we need to generate.  */
4779       unsigned int num_masks = 0;
4780       rgroup_controls *rgm;
4781       unsigned int num_vectors_m1;
4782       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4783                         num_vectors_m1, rgm)
4784         if (rgm->type)
4785           num_masks += num_vectors_m1 + 1;
4786       gcc_assert (num_masks > 0);
4787
4788       /* In the worst case, we need to generate each mask in the prologue
4789          and in the loop body.  One of the loop body mask instructions
4790          replaces the comparison in the scalar loop, and since we don't
4791          count the scalar comparison against the scalar body, we shouldn't
4792          count that vector instruction against the vector body either.
4793
4794          Sometimes we can use unpacks instead of generating prologue
4795          masks and sometimes the prologue mask will fold to a constant,
4796          so the actual prologue cost might be smaller.  However, it's
4797          simpler and safer to use the worst-case cost; if this ends up
4798          being the tie-breaker between vectorizing or not, then it's
4799          probably better not to vectorize.  */
4800       (void) add_stmt_cost (target_cost_data, num_masks,
4801                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4802                             vect_prologue);
4803       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4804                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4805                             vect_body);
4806     }
4807   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4808     {
4809       /* Referring to the functions vect_set_loop_condition_partial_vectors
4810          and vect_set_loop_controls_directly, we need to generate each
4811          length in the prologue and in the loop body if required. Although
4812          there are some possible optimizations, we consider the worst case
4813          here.  */
4814
4815       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4816       signed char partial_load_store_bias
4817         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4818       bool need_iterate_p
4819         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4820            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4821
4822       /* Calculate how many statements to be added.  */
4823       unsigned int prologue_stmts = 0;
4824       unsigned int body_stmts = 0;
4825
4826       rgroup_controls *rgc;
4827       unsigned int num_vectors_m1;
4828       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4829         if (rgc->type)
4830           {
4831             /* May need one SHIFT for nitems_total computation.  */
4832             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4833             if (nitems != 1 && !niters_known_p)
4834               prologue_stmts += 1;
4835
4836             /* May need one MAX and one MINUS for wrap around.  */
4837             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4838               prologue_stmts += 2;
4839
4840             /* Need one MAX and one MINUS for each batch limit excepting for
4841                the 1st one.  */
4842             prologue_stmts += num_vectors_m1 * 2;
4843
4844             unsigned int num_vectors = num_vectors_m1 + 1;
4845
4846             /* Need to set up lengths in prologue, only one MIN required
4847                for each since start index is zero.  */
4848             prologue_stmts += num_vectors;
4849
4850             /* If we have a non-zero partial load bias, we need one PLUS
4851                to adjust the load length.  */
4852             if (partial_load_store_bias != 0)
4853               body_stmts += 1;
4854
4855             /* Each may need two MINs and one MINUS to update lengths in body
4856                for next iteration.  */
4857             if (need_iterate_p)
4858               body_stmts += 3 * num_vectors;
4859           }
4860
4861       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4862                             scalar_stmt, vect_prologue);
4863       (void) add_stmt_cost (target_cost_data, body_stmts,
4864                             scalar_stmt, vect_body);
4865     }
4866
4867   /* FORNOW: The scalar outside cost is incremented in one of the
4868      following ways:
4869
4870      1. The vectorizer checks for alignment and aliasing and generates
4871      a condition that allows dynamic vectorization.  A cost model
4872      check is ANDED with the versioning condition.  Hence scalar code
4873      path now has the added cost of the versioning check.
4874
4875        if (cost > th & versioning_check)
4876          jmp to vector code
4877
4878      Hence run-time scalar is incremented by not-taken branch cost.
4879
4880      2. The vectorizer then checks if a prologue is required.  If the
4881      cost model check was not done before during versioning, it has to
4882      be done before the prologue check.
4883
4884        if (cost <= th)
4885          prologue = scalar_iters
4886        if (prologue == 0)
4887          jmp to vector code
4888        else
4889          execute prologue
4890        if (prologue == num_iters)
4891          go to exit
4892
4893      Hence the run-time scalar cost is incremented by a taken branch,
4894      plus a not-taken branch, plus a taken branch cost.
4895
4896      3. The vectorizer then checks if an epilogue is required.  If the
4897      cost model check was not done before during prologue check, it
4898      has to be done with the epilogue check.
4899
4900        if (prologue == 0)
4901          jmp to vector code
4902        else
4903          execute prologue
4904        if (prologue == num_iters)
4905          go to exit
4906        vector code:
4907          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4908            jmp to epilogue
4909
4910      Hence the run-time scalar cost should be incremented by 2 taken
4911      branches.
4912
4913      TODO: The back end may reorder the BBS's differently and reverse
4914      conditions/branch directions.  Change the estimates below to
4915      something more reasonable.  */
4916
4917   /* If the number of iterations is known and we do not do versioning, we can
4918      decide whether to vectorize at compile time.  Hence the scalar version
4919      do not carry cost model guard costs.  */
4920   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4921       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4922     {
4923       /* Cost model check occurs at versioning.  */
4924       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4925         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4926       else
4927         {
4928           /* Cost model check occurs at prologue generation.  */
4929           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4930             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4931               + vect_get_stmt_cost (cond_branch_not_taken);
4932           /* Cost model check occurs at epilogue generation.  */
4933           else
4934             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4935         }
4936     }
4937
4938   /* Complete the target-specific cost calculations.  */
4939   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4940                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4941                suggested_unroll_factor);
4942
4943   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4944       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4945       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4946                     *suggested_unroll_factor,
4947                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4948     {
4949       if (dump_enabled_p ())
4950         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4951                          "can't unroll as unrolled vectorization factor larger"
4952                          " than maximum vectorization factor: "
4953                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4954                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4955       *suggested_unroll_factor = 1;
4956     }
4957
4958   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4959
4960   if (dump_enabled_p ())
4961     {
4962       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4963       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4964                    vec_inside_cost);
4965       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4966                    vec_prologue_cost);
4967       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4968                    vec_epilogue_cost);
4969       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4970                    scalar_single_iter_cost);
4971       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4972                    scalar_outside_cost);
4973       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4974                    vec_outside_cost);
4975       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4976                    peel_iters_prologue);
4977       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4978                    peel_iters_epilogue);
4979     }
4980
4981   /* Calculate number of iterations required to make the vector version
4982      profitable, relative to the loop bodies only.  The following condition
4983      must hold true:
4984      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4985      where
4986      SIC = scalar iteration cost, VIC = vector iteration cost,
4987      VOC = vector outside cost, VF = vectorization factor,
4988      NPEEL = prologue iterations + epilogue iterations,
4989      SOC = scalar outside cost for run time cost model check.  */
4990
4991   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4992                           - vec_inside_cost);
4993   if (saving_per_viter <= 0)
4994     {
4995       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4996         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4997                     "vectorization did not happen for a simd loop");
4998
4999       if (dump_enabled_p ())
5000         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5001                          "cost model: the vector iteration cost = %d "
5002                          "divided by the scalar iteration cost = %d "
5003                          "is greater or equal to the vectorization factor = %d"
5004                          ".\n",
5005                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5006       *ret_min_profitable_niters = -1;
5007       *ret_min_profitable_estimate = -1;
5008       return;
5009     }
5010
5011   /* ??? The "if" arm is written to handle all cases; see below for what
5012      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5013   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5014     {
5015       /* Rewriting the condition above in terms of the number of
5016          vector iterations (vniters) rather than the number of
5017          scalar iterations (niters) gives:
5018
5019          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5020
5021          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5022
5023          For integer N, X and Y when X > 0:
5024
5025          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
5026       int outside_overhead = (vec_outside_cost
5027                               - scalar_single_iter_cost * peel_iters_prologue
5028                               - scalar_single_iter_cost * peel_iters_epilogue
5029                               - scalar_outside_cost);
5030       /* We're only interested in cases that require at least one
5031          vector iteration.  */
5032       int min_vec_niters = 1;
5033       if (outside_overhead > 0)
5034         min_vec_niters = outside_overhead / saving_per_viter + 1;
5035
5036       if (dump_enabled_p ())
5037         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
5038                      min_vec_niters);
5039
5040       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5041         {
5042           /* Now that we know the minimum number of vector iterations,
5043              find the minimum niters for which the scalar cost is larger:
5044
5045              SIC * niters > VIC * vniters + VOC - SOC
5046
5047              We know that the minimum niters is no more than
5048              vniters * VF + NPEEL, but it might be (and often is) less
5049              than that if a partial vector iteration is cheaper than the
5050              equivalent scalar code.  */
5051           int threshold = (vec_inside_cost * min_vec_niters
5052                            + vec_outside_cost
5053                            - scalar_outside_cost);
5054           if (threshold <= 0)
5055             min_profitable_iters = 1;
5056           else
5057             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5058         }
5059       else
5060         /* Convert the number of vector iterations into a number of
5061            scalar iterations.  */
5062         min_profitable_iters = (min_vec_niters * assumed_vf
5063                                 + peel_iters_prologue
5064                                 + peel_iters_epilogue);
5065     }
5066   else
5067     {
5068       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5069                               * assumed_vf
5070                               - vec_inside_cost * peel_iters_prologue
5071                               - vec_inside_cost * peel_iters_epilogue);
5072       if (min_profitable_iters <= 0)
5073         min_profitable_iters = 0;
5074       else
5075         {
5076           min_profitable_iters /= saving_per_viter;
5077
5078           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5079               <= (((int) vec_inside_cost * min_profitable_iters)
5080                   + (((int) vec_outside_cost - scalar_outside_cost)
5081                      * assumed_vf)))
5082             min_profitable_iters++;
5083         }
5084     }
5085
5086   if (dump_enabled_p ())
5087     dump_printf (MSG_NOTE,
5088                  "  Calculated minimum iters for profitability: %d\n",
5089                  min_profitable_iters);
5090
5091   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5092       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5093     /* We want the vectorized loop to execute at least once.  */
5094     min_profitable_iters = assumed_vf + peel_iters_prologue;
5095   else if (min_profitable_iters < peel_iters_prologue)
5096     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5097        vectorized loop executes at least once.  */
5098     min_profitable_iters = peel_iters_prologue;
5099
5100   if (dump_enabled_p ())
5101     dump_printf_loc (MSG_NOTE, vect_location,
5102                      "  Runtime profitability threshold = %d\n",
5103                      min_profitable_iters);
5104
5105   *ret_min_profitable_niters = min_profitable_iters;
5106
5107   /* Calculate number of iterations required to make the vector version
5108      profitable, relative to the loop bodies only.
5109
5110      Non-vectorized variant is SIC * niters and it must win over vector
5111      variant on the expected loop trip count.  The following condition must hold true:
5112      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
5113
5114   if (vec_outside_cost <= 0)
5115     min_profitable_estimate = 0;
5116   /* ??? This "else if" arm is written to handle all cases; see below for
5117      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5118   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5119     {
5120       /* This is a repeat of the code above, but with + SOC rather
5121          than - SOC.  */
5122       int outside_overhead = (vec_outside_cost
5123                               - scalar_single_iter_cost * peel_iters_prologue
5124                               - scalar_single_iter_cost * peel_iters_epilogue
5125                               + scalar_outside_cost);
5126       int min_vec_niters = 1;
5127       if (outside_overhead > 0)
5128         min_vec_niters = outside_overhead / saving_per_viter + 1;
5129
5130       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5131         {
5132           int threshold = (vec_inside_cost * min_vec_niters
5133                            + vec_outside_cost
5134                            + scalar_outside_cost);
5135           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5136         }
5137       else
5138         min_profitable_estimate = (min_vec_niters * assumed_vf
5139                                    + peel_iters_prologue
5140                                    + peel_iters_epilogue);
5141     }
5142   else
5143     {
5144       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5145                                  * assumed_vf
5146                                  - vec_inside_cost * peel_iters_prologue
5147                                  - vec_inside_cost * peel_iters_epilogue)
5148                                  / ((scalar_single_iter_cost * assumed_vf)
5149                                    - vec_inside_cost);
5150     }
5151   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5152   if (dump_enabled_p ())
5153     dump_printf_loc (MSG_NOTE, vect_location,
5154                      "  Static estimate profitability threshold = %d\n",
5155                      min_profitable_estimate);
5156
5157   *ret_min_profitable_estimate = min_profitable_estimate;
5158 }
5159
5160 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5161    vector elements (not bits) for a vector with NELT elements.  */
5162 static void
5163 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5164                               vec_perm_builder *sel)
5165 {
5166   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5167      by vec_perm_indices.  */
5168   sel->new_vector (nelt, 1, 3);
5169   for (unsigned int i = 0; i < 3; i++)
5170     sel->quick_push (i + offset);
5171 }
5172
5173 /* Checks whether the target supports whole-vector shifts for vectors of mode
5174    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5175    it supports vec_perm_const with masks for all necessary shift amounts.  */
5176 static bool
5177 have_whole_vector_shift (machine_mode mode)
5178 {
5179   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5180     return true;
5181
5182   /* Variable-length vectors should be handled via the optab.  */
5183   unsigned int nelt;
5184   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5185     return false;
5186
5187   vec_perm_builder sel;
5188   vec_perm_indices indices;
5189   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5190     {
5191       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5192       indices.new_vector (sel, 2, nelt);
5193       if (!can_vec_perm_const_p (mode, mode, indices, false))
5194         return false;
5195     }
5196   return true;
5197 }
5198
5199 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5200    multiplication operands have differing signs and (b) we intend
5201    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5202    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5203
5204 static bool
5205 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5206                                  stmt_vec_info stmt_info)
5207 {
5208   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5209   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5210     return false;
5211
5212   tree rhs1 = gimple_assign_rhs1 (assign);
5213   tree rhs2 = gimple_assign_rhs2 (assign);
5214   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5215     return false;
5216
5217   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5218   gcc_assert (reduc_info->is_reduc_info);
5219   return !directly_supported_p (DOT_PROD_EXPR,
5220                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5221                                 optab_vector_mixed_sign);
5222 }
5223
5224 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5225    functions. Design better to avoid maintenance issues.  */
5226
5227 /* Function vect_model_reduction_cost.
5228
5229    Models cost for a reduction operation, including the vector ops
5230    generated within the strip-mine loop in some cases, the initial
5231    definition before the loop, and the epilogue code that must be generated.  */
5232
5233 static void
5234 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5235                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5236                            vect_reduction_type reduction_type,
5237                            int ncopies, stmt_vector_for_cost *cost_vec)
5238 {
5239   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5240   tree vectype;
5241   machine_mode mode;
5242   class loop *loop = NULL;
5243
5244   if (loop_vinfo)
5245     loop = LOOP_VINFO_LOOP (loop_vinfo);
5246
5247   /* Condition reductions generate two reductions in the loop.  */
5248   if (reduction_type == COND_REDUCTION)
5249     ncopies *= 2;
5250
5251   vectype = STMT_VINFO_VECTYPE (stmt_info);
5252   mode = TYPE_MODE (vectype);
5253   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5254
5255   gimple_match_op op;
5256   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5257     gcc_unreachable ();
5258
5259   bool emulated_mixed_dot_prod
5260     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5261   if (reduction_type == EXTRACT_LAST_REDUCTION)
5262     /* No extra instructions are needed in the prologue.  The loop body
5263        operations are costed in vectorizable_condition.  */
5264     inside_cost = 0;
5265   else if (reduction_type == FOLD_LEFT_REDUCTION)
5266     {
5267       /* No extra instructions needed in the prologue.  */
5268       prologue_cost = 0;
5269
5270       if (reduc_fn != IFN_LAST)
5271         /* Count one reduction-like operation per vector.  */
5272         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5273                                         stmt_info, 0, vect_body);
5274       else
5275         {
5276           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5277           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5278           inside_cost = record_stmt_cost (cost_vec, nelements,
5279                                           vec_to_scalar, stmt_info, 0,
5280                                           vect_body);
5281           inside_cost += record_stmt_cost (cost_vec, nelements,
5282                                            scalar_stmt, stmt_info, 0,
5283                                            vect_body);
5284         }
5285     }
5286   else
5287     {
5288       /* Add in the cost of the initial definitions.  */
5289       int prologue_stmts;
5290       if (reduction_type == COND_REDUCTION)
5291         /* For cond reductions we have four vectors: initial index, step,
5292            initial result of the data reduction, initial value of the index
5293            reduction.  */
5294         prologue_stmts = 4;
5295       else if (emulated_mixed_dot_prod)
5296         /* We need the initial reduction value and two invariants:
5297            one that contains the minimum signed value and one that
5298            contains half of its negative.  */
5299         prologue_stmts = 3;
5300       else
5301         prologue_stmts = 1;
5302       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5303                                          scalar_to_vec, stmt_info, 0,
5304                                          vect_prologue);
5305     }
5306
5307   /* Determine cost of epilogue code.
5308
5309      We have a reduction operator that will reduce the vector in one statement.
5310      Also requires scalar extract.  */
5311
5312   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5313     {
5314       if (reduc_fn != IFN_LAST)
5315         {
5316           if (reduction_type == COND_REDUCTION)
5317             {
5318               /* An EQ stmt and an COND_EXPR stmt.  */
5319               epilogue_cost += record_stmt_cost (cost_vec, 2,
5320                                                  vector_stmt, stmt_info, 0,
5321                                                  vect_epilogue);
5322               /* Reduction of the max index and a reduction of the found
5323                  values.  */
5324               epilogue_cost += record_stmt_cost (cost_vec, 2,
5325                                                  vec_to_scalar, stmt_info, 0,
5326                                                  vect_epilogue);
5327               /* A broadcast of the max value.  */
5328               epilogue_cost += record_stmt_cost (cost_vec, 1,
5329                                                  scalar_to_vec, stmt_info, 0,
5330                                                  vect_epilogue);
5331             }
5332           else
5333             {
5334               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5335                                                  stmt_info, 0, vect_epilogue);
5336               epilogue_cost += record_stmt_cost (cost_vec, 1,
5337                                                  vec_to_scalar, stmt_info, 0,
5338                                                  vect_epilogue);
5339             }
5340         }
5341       else if (reduction_type == COND_REDUCTION)
5342         {
5343           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5344           /* Extraction of scalar elements.  */
5345           epilogue_cost += record_stmt_cost (cost_vec,
5346                                              2 * estimated_nunits,
5347                                              vec_to_scalar, stmt_info, 0,
5348                                              vect_epilogue);
5349           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5350           epilogue_cost += record_stmt_cost (cost_vec,
5351                                              2 * estimated_nunits - 3,
5352                                              scalar_stmt, stmt_info, 0,
5353                                              vect_epilogue);
5354         }
5355       else if (reduction_type == EXTRACT_LAST_REDUCTION
5356                || reduction_type == FOLD_LEFT_REDUCTION)
5357         /* No extra instructions need in the epilogue.  */
5358         ;
5359       else
5360         {
5361           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5362           tree bitsize = TYPE_SIZE (op.type);
5363           int element_bitsize = tree_to_uhwi (bitsize);
5364           int nelements = vec_size_in_bits / element_bitsize;
5365
5366           if (op.code == COND_EXPR)
5367             op.code = MAX_EXPR;
5368
5369           /* We have a whole vector shift available.  */
5370           if (VECTOR_MODE_P (mode)
5371               && directly_supported_p (op.code, vectype)
5372               && have_whole_vector_shift (mode))
5373             {
5374               /* Final reduction via vector shifts and the reduction operator.
5375                  Also requires scalar extract.  */
5376               epilogue_cost += record_stmt_cost (cost_vec,
5377                                                  exact_log2 (nelements) * 2,
5378                                                  vector_stmt, stmt_info, 0,
5379                                                  vect_epilogue);
5380               epilogue_cost += record_stmt_cost (cost_vec, 1,
5381                                                  vec_to_scalar, stmt_info, 0,
5382                                                  vect_epilogue);
5383             }
5384           else
5385             /* Use extracts and reduction op for final reduction.  For N
5386                elements, we have N extracts and N-1 reduction ops.  */
5387             epilogue_cost += record_stmt_cost (cost_vec,
5388                                                nelements + nelements - 1,
5389                                                vector_stmt, stmt_info, 0,
5390                                                vect_epilogue);
5391         }
5392     }
5393
5394   if (dump_enabled_p ())
5395     dump_printf (MSG_NOTE,
5396                  "vect_model_reduction_cost: inside_cost = %d, "
5397                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5398                  prologue_cost, epilogue_cost);
5399 }
5400
5401 /* SEQ is a sequence of instructions that initialize the reduction
5402    described by REDUC_INFO.  Emit them in the appropriate place.  */
5403
5404 static void
5405 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5406                                 stmt_vec_info reduc_info, gimple *seq)
5407 {
5408   if (reduc_info->reused_accumulator)
5409     {
5410       /* When reusing an accumulator from the main loop, we only need
5411          initialization instructions if the main loop can be skipped.
5412          In that case, emit the initialization instructions at the end
5413          of the guard block that does the skip.  */
5414       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5415       gcc_assert (skip_edge);
5416       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5417       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5418     }
5419   else
5420     {
5421       /* The normal case: emit the initialization instructions on the
5422          preheader edge.  */
5423       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5424       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5425     }
5426 }
5427
5428 /* Function get_initial_def_for_reduction
5429
5430    Input:
5431    REDUC_INFO - the info_for_reduction
5432    INIT_VAL - the initial value of the reduction variable
5433    NEUTRAL_OP - a value that has no effect on the reduction, as per
5434                 neutral_op_for_reduction
5435
5436    Output:
5437    Return a vector variable, initialized according to the operation that
5438         STMT_VINFO performs. This vector will be used as the initial value
5439         of the vector of partial results.
5440
5441    The value we need is a vector in which element 0 has value INIT_VAL
5442    and every other element has value NEUTRAL_OP.  */
5443
5444 static tree
5445 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5446                                stmt_vec_info reduc_info,
5447                                tree init_val, tree neutral_op)
5448 {
5449   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5450   tree scalar_type = TREE_TYPE (init_val);
5451   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5452   tree init_def;
5453   gimple_seq stmts = NULL;
5454
5455   gcc_assert (vectype);
5456
5457   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5458               || SCALAR_FLOAT_TYPE_P (scalar_type));
5459
5460   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5461               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5462
5463   if (operand_equal_p (init_val, neutral_op))
5464     {
5465       /* If both elements are equal then the vector described above is
5466          just a splat.  */
5467       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5468       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5469     }
5470   else
5471     {
5472       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5473       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5474       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5475         {
5476           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5477              element 0.  */
5478           init_def = gimple_build_vector_from_val (&stmts, vectype,
5479                                                    neutral_op);
5480           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5481                                    vectype, init_def, init_val);
5482         }
5483       else
5484         {
5485           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5486           tree_vector_builder elts (vectype, 1, 2);
5487           elts.quick_push (init_val);
5488           elts.quick_push (neutral_op);
5489           init_def = gimple_build_vector (&stmts, &elts);
5490         }
5491     }
5492
5493   if (stmts)
5494     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5495   return init_def;
5496 }
5497
5498 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5499    which performs a reduction involving GROUP_SIZE scalar statements.
5500    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5501    is nonnull, introducing extra elements of that value will not change the
5502    result.  */
5503
5504 static void
5505 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5506                                 stmt_vec_info reduc_info,
5507                                 vec<tree> *vec_oprnds,
5508                                 unsigned int number_of_vectors,
5509                                 unsigned int group_size, tree neutral_op)
5510 {
5511   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5512   unsigned HOST_WIDE_INT nunits;
5513   unsigned j, number_of_places_left_in_vector;
5514   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5515   unsigned int i;
5516
5517   gcc_assert (group_size == initial_values.length () || neutral_op);
5518
5519   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5520      created vectors. It is greater than 1 if unrolling is performed.
5521
5522      For example, we have two scalar operands, s1 and s2 (e.g., group of
5523      strided accesses of size two), while NUNITS is four (i.e., four scalars
5524      of this type can be packed in a vector).  The output vector will contain
5525      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5526      will be 2).
5527
5528      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5529      vectors containing the operands.
5530
5531      For example, NUNITS is four as before, and the group size is 8
5532      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5533      {s5, s6, s7, s8}.  */
5534
5535   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5536     nunits = group_size;
5537
5538   number_of_places_left_in_vector = nunits;
5539   bool constant_p = true;
5540   tree_vector_builder elts (vector_type, nunits, 1);
5541   elts.quick_grow (nunits);
5542   gimple_seq ctor_seq = NULL;
5543   for (j = 0; j < nunits * number_of_vectors; ++j)
5544     {
5545       tree op;
5546       i = j % group_size;
5547
5548       /* Get the def before the loop.  In reduction chain we have only
5549          one initial value.  Else we have as many as PHIs in the group.  */
5550       if (i >= initial_values.length () || (j > i && neutral_op))
5551         op = neutral_op;
5552       else
5553         op = initial_values[i];
5554
5555       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5556       number_of_places_left_in_vector--;
5557       elts[nunits - number_of_places_left_in_vector - 1] = op;
5558       if (!CONSTANT_CLASS_P (op))
5559         constant_p = false;
5560
5561       if (number_of_places_left_in_vector == 0)
5562         {
5563           tree init;
5564           if (constant_p && !neutral_op
5565               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5566               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5567             /* Build the vector directly from ELTS.  */
5568             init = gimple_build_vector (&ctor_seq, &elts);
5569           else if (neutral_op)
5570             {
5571               /* Build a vector of the neutral value and shift the
5572                  other elements into place.  */
5573               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5574                                                    neutral_op);
5575               int k = nunits;
5576               while (k > 0 && elts[k - 1] == neutral_op)
5577                 k -= 1;
5578               while (k > 0)
5579                 {
5580                   k -= 1;
5581                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5582                                        vector_type, init, elts[k]);
5583                 }
5584             }
5585           else
5586             {
5587               /* First time round, duplicate ELTS to fill the
5588                  required number of vectors.  */
5589               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5590                                         elts, number_of_vectors, *vec_oprnds);
5591               break;
5592             }
5593           vec_oprnds->quick_push (init);
5594
5595           number_of_places_left_in_vector = nunits;
5596           elts.new_vector (vector_type, nunits, 1);
5597           elts.quick_grow (nunits);
5598           constant_p = true;
5599         }
5600     }
5601   if (ctor_seq != NULL)
5602     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5603 }
5604
5605 /* For a statement STMT_INFO taking part in a reduction operation return
5606    the stmt_vec_info the meta information is stored on.  */
5607
5608 stmt_vec_info
5609 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5610 {
5611   stmt_info = vect_orig_stmt (stmt_info);
5612   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5613   if (!is_a <gphi *> (stmt_info->stmt)
5614       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5615     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5616   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5617   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5618     {
5619       if (gimple_phi_num_args (phi) == 1)
5620         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5621     }
5622   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5623     {
5624       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5625       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5626         stmt_info = info;
5627     }
5628   return stmt_info;
5629 }
5630
5631 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5632    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5633    return false.  */
5634
5635 static bool
5636 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5637                                 stmt_vec_info reduc_info)
5638 {
5639   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5640   if (!main_loop_vinfo)
5641     return false;
5642
5643   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5644     return false;
5645
5646   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5647   auto_vec<tree, 16> main_loop_results (num_phis);
5648   auto_vec<tree, 16> initial_values (num_phis);
5649   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5650     {
5651       /* The epilogue loop can be entered either from the main loop or
5652          from an earlier guard block.  */
5653       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5654       for (tree incoming_value : reduc_info->reduc_initial_values)
5655         {
5656           /* Look for:
5657
5658                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5659                                     INITIAL_VALUE(guard block)>.  */
5660           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5661
5662           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5663           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5664
5665           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5666           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5667
5668           main_loop_results.quick_push (from_main_loop);
5669           initial_values.quick_push (from_skip);
5670         }
5671     }
5672   else
5673     /* The main loop dominates the epilogue loop.  */
5674     main_loop_results.splice (reduc_info->reduc_initial_values);
5675
5676   /* See if the main loop has the kind of accumulator we need.  */
5677   vect_reusable_accumulator *accumulator
5678     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5679   if (!accumulator
5680       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5681       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5682                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5683     return false;
5684
5685   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5686   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5687   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5688   unsigned HOST_WIDE_INT m;
5689   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5690                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5691     return false;
5692   /* Check the intermediate vector types and operations are available.  */
5693   tree prev_vectype = old_vectype;
5694   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5695   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5696     {
5697       intermediate_nunits = exact_div (intermediate_nunits, 2);
5698       tree intermediate_vectype = get_related_vectype_for_scalar_type
5699         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5700       if (!intermediate_vectype
5701           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5702                                     intermediate_vectype)
5703           || !can_vec_extract (TYPE_MODE (prev_vectype),
5704                                TYPE_MODE (intermediate_vectype)))
5705         return false;
5706       prev_vectype = intermediate_vectype;
5707     }
5708
5709   /* Non-SLP reductions might apply an adjustment after the reduction
5710      operation, in order to simplify the initialization of the accumulator.
5711      If the epilogue loop carries on from where the main loop left off,
5712      it should apply the same adjustment to the final reduction result.
5713
5714      If the epilogue loop can also be entered directly (rather than via
5715      the main loop), we need to be able to handle that case in the same way,
5716      with the same adjustment.  (In principle we could add a PHI node
5717      to select the correct adjustment, but in practice that shouldn't be
5718      necessary.)  */
5719   tree main_adjustment
5720     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5721   if (loop_vinfo->main_loop_edge && main_adjustment)
5722     {
5723       gcc_assert (num_phis == 1);
5724       tree initial_value = initial_values[0];
5725       /* Check that we can use INITIAL_VALUE as the adjustment and
5726          initialize the accumulator with a neutral value instead.  */
5727       if (!operand_equal_p (initial_value, main_adjustment))
5728         return false;
5729       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5730       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5731                                                     code, initial_value);
5732     }
5733   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5734   reduc_info->reduc_initial_values.truncate (0);
5735   reduc_info->reduc_initial_values.splice (initial_values);
5736   reduc_info->reused_accumulator = accumulator;
5737   return true;
5738 }
5739
5740 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5741    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5742
5743 static tree
5744 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5745                             gimple_seq *seq)
5746 {
5747   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5748   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5749   tree stype = TREE_TYPE (vectype);
5750   tree new_temp = vec_def;
5751   while (nunits > nunits1)
5752     {
5753       nunits /= 2;
5754       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5755                                                            stype, nunits);
5756       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5757
5758       /* The target has to make sure we support lowpart/highpart
5759          extraction, either via direct vector extract or through
5760          an integer mode punning.  */
5761       tree dst1, dst2;
5762       gimple *epilog_stmt;
5763       if (convert_optab_handler (vec_extract_optab,
5764                                  TYPE_MODE (TREE_TYPE (new_temp)),
5765                                  TYPE_MODE (vectype1))
5766           != CODE_FOR_nothing)
5767         {
5768           /* Extract sub-vectors directly once vec_extract becomes
5769              a conversion optab.  */
5770           dst1 = make_ssa_name (vectype1);
5771           epilog_stmt
5772               = gimple_build_assign (dst1, BIT_FIELD_REF,
5773                                      build3 (BIT_FIELD_REF, vectype1,
5774                                              new_temp, TYPE_SIZE (vectype1),
5775                                              bitsize_int (0)));
5776           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5777           dst2 =  make_ssa_name (vectype1);
5778           epilog_stmt
5779               = gimple_build_assign (dst2, BIT_FIELD_REF,
5780                                      build3 (BIT_FIELD_REF, vectype1,
5781                                              new_temp, TYPE_SIZE (vectype1),
5782                                              bitsize_int (bitsize)));
5783           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5784         }
5785       else
5786         {
5787           /* Extract via punning to appropriately sized integer mode
5788              vector.  */
5789           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5790           tree etype = build_vector_type (eltype, 2);
5791           gcc_assert (convert_optab_handler (vec_extract_optab,
5792                                              TYPE_MODE (etype),
5793                                              TYPE_MODE (eltype))
5794                       != CODE_FOR_nothing);
5795           tree tem = make_ssa_name (etype);
5796           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5797                                              build1 (VIEW_CONVERT_EXPR,
5798                                                      etype, new_temp));
5799           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5800           new_temp = tem;
5801           tem = make_ssa_name (eltype);
5802           epilog_stmt
5803               = gimple_build_assign (tem, BIT_FIELD_REF,
5804                                      build3 (BIT_FIELD_REF, eltype,
5805                                              new_temp, TYPE_SIZE (eltype),
5806                                              bitsize_int (0)));
5807           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5808           dst1 = make_ssa_name (vectype1);
5809           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5810                                              build1 (VIEW_CONVERT_EXPR,
5811                                                      vectype1, tem));
5812           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5813           tem = make_ssa_name (eltype);
5814           epilog_stmt
5815               = gimple_build_assign (tem, BIT_FIELD_REF,
5816                                      build3 (BIT_FIELD_REF, eltype,
5817                                              new_temp, TYPE_SIZE (eltype),
5818                                              bitsize_int (bitsize)));
5819           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5820           dst2 =  make_ssa_name (vectype1);
5821           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5822                                              build1 (VIEW_CONVERT_EXPR,
5823                                                      vectype1, tem));
5824           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5825         }
5826
5827       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5828     }
5829
5830   return new_temp;
5831 }
5832
5833 /* Function vect_create_epilog_for_reduction
5834
5835    Create code at the loop-epilog to finalize the result of a reduction
5836    computation.
5837
5838    STMT_INFO is the scalar reduction stmt that is being vectorized.
5839    SLP_NODE is an SLP node containing a group of reduction statements. The
5840      first one in this group is STMT_INFO.
5841    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5842    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5843      (counting from 0)
5844
5845    This function:
5846    1. Completes the reduction def-use cycles.
5847    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5848       by calling the function specified by REDUC_FN if available, or by
5849       other means (whole-vector shifts or a scalar loop).
5850       The function also creates a new phi node at the loop exit to preserve
5851       loop-closed form, as illustrated below.
5852
5853      The flow at the entry to this function:
5854
5855         loop:
5856           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5857           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5858           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5859         loop_exit:
5860           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5861           use <s_out0>
5862           use <s_out0>
5863
5864      The above is transformed by this function into:
5865
5866         loop:
5867           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5868           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5869           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5870         loop_exit:
5871           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5872           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5873           v_out2 = reduce <v_out1>
5874           s_out3 = extract_field <v_out2, 0>
5875           s_out4 = adjust_result <s_out3>
5876           use <s_out4>
5877           use <s_out4>
5878 */
5879
5880 static void
5881 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5882                                   stmt_vec_info stmt_info,
5883                                   slp_tree slp_node,
5884                                   slp_instance slp_node_instance)
5885 {
5886   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5887   gcc_assert (reduc_info->is_reduc_info);
5888   /* For double reductions we need to get at the inner loop reduction
5889      stmt which has the meta info attached.  Our stmt_info is that of the
5890      loop-closed PHI of the inner loop which we remember as
5891      def for the reduction PHI generation.  */
5892   bool double_reduc = false;
5893   stmt_vec_info rdef_info = stmt_info;
5894   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5895     {
5896       gcc_assert (!slp_node);
5897       double_reduc = true;
5898       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5899                                             (stmt_info->stmt, 0));
5900       stmt_info = vect_stmt_to_vectorize (stmt_info);
5901     }
5902   gphi *reduc_def_stmt
5903     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5904   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5905   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5906   tree vectype;
5907   machine_mode mode;
5908   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5909   basic_block exit_bb;
5910   tree scalar_dest;
5911   tree scalar_type;
5912   gimple *new_phi = NULL, *phi = NULL;
5913   gimple_stmt_iterator exit_gsi;
5914   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5915   gimple *epilog_stmt = NULL;
5916   gimple *exit_phi;
5917   tree bitsize;
5918   tree def;
5919   tree orig_name, scalar_result;
5920   imm_use_iterator imm_iter, phi_imm_iter;
5921   use_operand_p use_p, phi_use_p;
5922   gimple *use_stmt;
5923   auto_vec<tree> reduc_inputs;
5924   int j, i;
5925   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5926   unsigned int group_size = 1, k;
5927   auto_vec<gimple *> phis;
5928   /* SLP reduction without reduction chain, e.g.,
5929      # a1 = phi <a2, a0>
5930      # b1 = phi <b2, b0>
5931      a2 = operation (a1)
5932      b2 = operation (b1)  */
5933   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5934   bool direct_slp_reduc;
5935   tree induction_index = NULL_TREE;
5936
5937   if (slp_node)
5938     group_size = SLP_TREE_LANES (slp_node);
5939
5940   if (nested_in_vect_loop_p (loop, stmt_info))
5941     {
5942       outer_loop = loop;
5943       loop = loop->inner;
5944       gcc_assert (!slp_node && double_reduc);
5945     }
5946
5947   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5948   gcc_assert (vectype);
5949   mode = TYPE_MODE (vectype);
5950
5951   tree induc_val = NULL_TREE;
5952   tree adjustment_def = NULL;
5953   if (slp_node)
5954     ;
5955   else
5956     {
5957       /* Optimize: for induction condition reduction, if we can't use zero
5958          for induc_val, use initial_def.  */
5959       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5960         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5961       else if (double_reduc)
5962         ;
5963       else
5964         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5965     }
5966
5967   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5968   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5969   if (slp_reduc)
5970     /* All statements produce live-out values.  */
5971     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5972   else if (slp_node)
5973     {
5974       /* The last statement in the reduction chain produces the live-out
5975          value.  Note SLP optimization can shuffle scalar stmts to
5976          optimize permutations so we have to search for the last stmt.  */
5977       for (k = 0; k < group_size; ++k)
5978         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5979           {
5980             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5981             break;
5982           }
5983     }
5984
5985   unsigned vec_num;
5986   int ncopies;
5987   if (slp_node)
5988     {
5989       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5990       ncopies = 1;
5991     }
5992   else
5993     {
5994       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5995       vec_num = 1;
5996       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5997     }
5998
5999   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6000      which is updated with the current index of the loop for every match of
6001      the original loop's cond_expr (VEC_STMT).  This results in a vector
6002      containing the last time the condition passed for that vector lane.
6003      The first match will be a 1 to allow 0 to be used for non-matching
6004      indexes.  If there are no matches at all then the vector will be all
6005      zeroes.
6006
6007      PR92772: This algorithm is broken for architectures that support
6008      masked vectors, but do not provide fold_extract_last.  */
6009   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6010     {
6011       auto_vec<std::pair<tree, bool>, 2> ccompares;
6012       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6013       cond_info = vect_stmt_to_vectorize (cond_info);
6014       while (cond_info != reduc_info)
6015         {
6016           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6017             {
6018               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6019               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6020               ccompares.safe_push
6021                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6022                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
6023             }
6024           cond_info
6025             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6026                                                  1 + STMT_VINFO_REDUC_IDX
6027                                                         (cond_info)));
6028           cond_info = vect_stmt_to_vectorize (cond_info);
6029         }
6030       gcc_assert (ccompares.length () != 0);
6031
6032       tree indx_before_incr, indx_after_incr;
6033       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6034       int scalar_precision
6035         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6036       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6037       tree cr_index_vector_type = get_related_vectype_for_scalar_type
6038         (TYPE_MODE (vectype), cr_index_scalar_type,
6039          TYPE_VECTOR_SUBPARTS (vectype));
6040
6041       /* First we create a simple vector induction variable which starts
6042          with the values {1,2,3,...} (SERIES_VECT) and increments by the
6043          vector size (STEP).  */
6044
6045       /* Create a {1,2,3,...} vector.  */
6046       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6047
6048       /* Create a vector of the step value.  */
6049       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6050       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6051
6052       /* Create an induction variable.  */
6053       gimple_stmt_iterator incr_gsi;
6054       bool insert_after;
6055       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6056       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6057                  insert_after, &indx_before_incr, &indx_after_incr);
6058
6059       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6060          filled with zeros (VEC_ZERO).  */
6061
6062       /* Create a vector of 0s.  */
6063       tree zero = build_zero_cst (cr_index_scalar_type);
6064       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6065
6066       /* Create a vector phi node.  */
6067       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6068       new_phi = create_phi_node (new_phi_tree, loop->header);
6069       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6070                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
6071
6072       /* Now take the condition from the loops original cond_exprs
6073          and produce a new cond_exprs (INDEX_COND_EXPR) which for
6074          every match uses values from the induction variable
6075          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6076          (NEW_PHI_TREE).
6077          Finally, we update the phi (NEW_PHI_TREE) to take the value of
6078          the new cond_expr (INDEX_COND_EXPR).  */
6079       gimple_seq stmts = NULL;
6080       for (int i = ccompares.length () - 1; i != -1; --i)
6081         {
6082           tree ccompare = ccompares[i].first;
6083           if (ccompares[i].second)
6084             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6085                                          cr_index_vector_type,
6086                                          ccompare,
6087                                          indx_before_incr, new_phi_tree);
6088           else
6089             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6090                                          cr_index_vector_type,
6091                                          ccompare,
6092                                          new_phi_tree, indx_before_incr);
6093         }
6094       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6095
6096       /* Update the phi with the vec cond.  */
6097       induction_index = new_phi_tree;
6098       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6099                    loop_latch_edge (loop), UNKNOWN_LOCATION);
6100     }
6101
6102   /* 2. Create epilog code.
6103         The reduction epilog code operates across the elements of the vector
6104         of partial results computed by the vectorized loop.
6105         The reduction epilog code consists of:
6106
6107         step 1: compute the scalar result in a vector (v_out2)
6108         step 2: extract the scalar result (s_out3) from the vector (v_out2)
6109         step 3: adjust the scalar result (s_out3) if needed.
6110
6111         Step 1 can be accomplished using one the following three schemes:
6112           (scheme 1) using reduc_fn, if available.
6113           (scheme 2) using whole-vector shifts, if available.
6114           (scheme 3) using a scalar loop. In this case steps 1+2 above are
6115                      combined.
6116
6117           The overall epilog code looks like this:
6118
6119           s_out0 = phi <s_loop>         # original EXIT_PHI
6120           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
6121           v_out2 = reduce <v_out1>              # step 1
6122           s_out3 = extract_field <v_out2, 0>    # step 2
6123           s_out4 = adjust_result <s_out3>       # step 3
6124
6125           (step 3 is optional, and steps 1 and 2 may be combined).
6126           Lastly, the uses of s_out0 are replaced by s_out4.  */
6127
6128
6129   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6130          v_out1 = phi <VECT_DEF>
6131          Store them in NEW_PHIS.  */
6132   if (double_reduc)
6133     loop = outer_loop;
6134   exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6135   exit_gsi = gsi_after_labels (exit_bb);
6136   reduc_inputs.create (slp_node ? vec_num : ncopies);
6137   for (unsigned i = 0; i < vec_num; i++)
6138     {
6139       gimple_seq stmts = NULL;
6140       if (slp_node)
6141         def = vect_get_slp_vect_def (slp_node, i);
6142       else
6143         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6144       for (j = 0; j < ncopies; j++)
6145         {
6146           tree new_def = copy_ssa_name (def);
6147           phi = create_phi_node (new_def, exit_bb);
6148           if (j)
6149             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6150           SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6151           new_def = gimple_convert (&stmts, vectype, new_def);
6152           reduc_inputs.quick_push (new_def);
6153         }
6154       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6155     }
6156
6157   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6158          (i.e. when reduc_fn is not available) and in the final adjustment
6159          code (if needed).  Also get the original scalar reduction variable as
6160          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6161          represents a reduction pattern), the tree-code and scalar-def are
6162          taken from the original stmt that the pattern-stmt (STMT) replaces.
6163          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6164          are taken from STMT.  */
6165
6166   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6167   if (orig_stmt_info != stmt_info)
6168     {
6169       /* Reduction pattern  */
6170       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6171       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6172     }
6173
6174   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6175   scalar_type = TREE_TYPE (scalar_dest);
6176   scalar_results.truncate (0);
6177   scalar_results.reserve_exact (group_size);
6178   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6179   bitsize = TYPE_SIZE (scalar_type);
6180
6181   /* True if we should implement SLP_REDUC using native reduction operations
6182      instead of scalar operations.  */
6183   direct_slp_reduc = (reduc_fn != IFN_LAST
6184                       && slp_reduc
6185                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6186
6187   /* In case of reduction chain, e.g.,
6188      # a1 = phi <a3, a0>
6189      a2 = operation (a1)
6190      a3 = operation (a2),
6191
6192      we may end up with more than one vector result.  Here we reduce them
6193      to one vector.
6194
6195      The same is true for a SLP reduction, e.g.,
6196      # a1 = phi <a2, a0>
6197      # b1 = phi <b2, b0>
6198      a2 = operation (a1)
6199      b2 = operation (a2),
6200
6201      where we can end up with more than one vector as well.  We can
6202      easily accumulate vectors when the number of vector elements is
6203      a multiple of the SLP group size.
6204
6205      The same is true if we couldn't use a single defuse cycle.  */
6206   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6207       || direct_slp_reduc
6208       || (slp_reduc
6209           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6210       || ncopies > 1)
6211     {
6212       gimple_seq stmts = NULL;
6213       tree single_input = reduc_inputs[0];
6214       for (k = 1; k < reduc_inputs.length (); k++)
6215         single_input = gimple_build (&stmts, code, vectype,
6216                                      single_input, reduc_inputs[k]);
6217       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6218
6219       reduc_inputs.truncate (0);
6220       reduc_inputs.safe_push (single_input);
6221     }
6222
6223   tree orig_reduc_input = reduc_inputs[0];
6224
6225   /* If this loop is an epilogue loop that can be skipped after the
6226      main loop, we can only share a reduction operation between the
6227      main loop and the epilogue if we put it at the target of the
6228      skip edge.
6229
6230      We can still reuse accumulators if this check fails.  Doing so has
6231      the minor(?) benefit of making the epilogue loop's scalar result
6232      independent of the main loop's scalar result.  */
6233   bool unify_with_main_loop_p = false;
6234   if (reduc_info->reused_accumulator
6235       && loop_vinfo->skip_this_loop_edge
6236       && single_succ_p (exit_bb)
6237       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6238     {
6239       unify_with_main_loop_p = true;
6240
6241       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6242       reduc_inputs[0] = make_ssa_name (vectype);
6243       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6244       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6245                    UNKNOWN_LOCATION);
6246       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6247                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6248       exit_gsi = gsi_after_labels (reduc_block);
6249     }
6250
6251   /* Shouldn't be used beyond this point.  */
6252   exit_bb = nullptr;
6253
6254   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6255       && reduc_fn != IFN_LAST)
6256     {
6257       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6258          various data values where the condition matched and another vector
6259          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6260          need to extract the last matching index (which will be the index with
6261          highest value) and use this to index into the data vector.
6262          For the case where there were no matches, the data vector will contain
6263          all default values and the index vector will be all zeros.  */
6264
6265       /* Get various versions of the type of the vector of indexes.  */
6266       tree index_vec_type = TREE_TYPE (induction_index);
6267       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6268       tree index_scalar_type = TREE_TYPE (index_vec_type);
6269       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6270
6271       /* Get an unsigned integer version of the type of the data vector.  */
6272       int scalar_precision
6273         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6274       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6275       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6276                                                 vectype);
6277
6278       /* First we need to create a vector (ZERO_VEC) of zeros and another
6279          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6280          can create using a MAX reduction and then expanding.
6281          In the case where the loop never made any matches, the max index will
6282          be zero.  */
6283
6284       /* Vector of {0, 0, 0,...}.  */
6285       tree zero_vec = build_zero_cst (vectype);
6286
6287       /* Find maximum value from the vector of found indexes.  */
6288       tree max_index = make_ssa_name (index_scalar_type);
6289       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6290                                                           1, induction_index);
6291       gimple_call_set_lhs (max_index_stmt, max_index);
6292       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6293
6294       /* Vector of {max_index, max_index, max_index,...}.  */
6295       tree max_index_vec = make_ssa_name (index_vec_type);
6296       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6297                                                       max_index);
6298       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6299                                                         max_index_vec_rhs);
6300       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6301
6302       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6303          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6304          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6305          otherwise.  Only one value should match, resulting in a vector
6306          (VEC_COND) with one data value and the rest zeros.
6307          In the case where the loop never made any matches, every index will
6308          match, resulting in a vector with all data values (which will all be
6309          the default value).  */
6310
6311       /* Compare the max index vector to the vector of found indexes to find
6312          the position of the max value.  */
6313       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6314       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6315                                                       induction_index,
6316                                                       max_index_vec);
6317       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6318
6319       /* Use the compare to choose either values from the data vector or
6320          zero.  */
6321       tree vec_cond = make_ssa_name (vectype);
6322       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6323                                                    vec_compare,
6324                                                    reduc_inputs[0],
6325                                                    zero_vec);
6326       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6327
6328       /* Finally we need to extract the data value from the vector (VEC_COND)
6329          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
6330          reduction, but because this doesn't exist, we can use a MAX reduction
6331          instead.  The data value might be signed or a float so we need to cast
6332          it first.
6333          In the case where the loop never made any matches, the data values are
6334          all identical, and so will reduce down correctly.  */
6335
6336       /* Make the matched data values unsigned.  */
6337       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6338       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6339                                        vec_cond);
6340       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6341                                                         VIEW_CONVERT_EXPR,
6342                                                         vec_cond_cast_rhs);
6343       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6344
6345       /* Reduce down to a scalar value.  */
6346       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6347       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6348                                                            1, vec_cond_cast);
6349       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6350       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6351
6352       /* Convert the reduced value back to the result type and set as the
6353          result.  */
6354       gimple_seq stmts = NULL;
6355       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6356                                data_reduc);
6357       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6358       scalar_results.safe_push (new_temp);
6359     }
6360   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6361            && reduc_fn == IFN_LAST)
6362     {
6363       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6364          idx = 0;
6365          idx_val = induction_index[0];
6366          val = data_reduc[0];
6367          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6368            if (induction_index[i] > idx_val)
6369              val = data_reduc[i], idx_val = induction_index[i];
6370          return val;  */
6371
6372       tree data_eltype = TREE_TYPE (vectype);
6373       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6374       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6375       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6376       /* Enforced by vectorizable_reduction, which ensures we have target
6377          support before allowing a conditional reduction on variable-length
6378          vectors.  */
6379       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6380       tree idx_val = NULL_TREE, val = NULL_TREE;
6381       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6382         {
6383           tree old_idx_val = idx_val;
6384           tree old_val = val;
6385           idx_val = make_ssa_name (idx_eltype);
6386           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6387                                              build3 (BIT_FIELD_REF, idx_eltype,
6388                                                      induction_index,
6389                                                      bitsize_int (el_size),
6390                                                      bitsize_int (off)));
6391           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6392           val = make_ssa_name (data_eltype);
6393           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6394                                              build3 (BIT_FIELD_REF,
6395                                                      data_eltype,
6396                                                      reduc_inputs[0],
6397                                                      bitsize_int (el_size),
6398                                                      bitsize_int (off)));
6399           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6400           if (off != 0)
6401             {
6402               tree new_idx_val = idx_val;
6403               if (off != v_size - el_size)
6404                 {
6405                   new_idx_val = make_ssa_name (idx_eltype);
6406                   epilog_stmt = gimple_build_assign (new_idx_val,
6407                                                      MAX_EXPR, idx_val,
6408                                                      old_idx_val);
6409                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6410                 }
6411               tree cond = make_ssa_name (boolean_type_node);
6412               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6413                                                  idx_val, old_idx_val);
6414               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6415               tree new_val = make_ssa_name (data_eltype);
6416               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6417                                                  cond, val, old_val);
6418               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6419               idx_val = new_idx_val;
6420               val = new_val;
6421             }
6422         }
6423       /* Convert the reduced value back to the result type and set as the
6424          result.  */
6425       gimple_seq stmts = NULL;
6426       val = gimple_convert (&stmts, scalar_type, val);
6427       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6428       scalar_results.safe_push (val);
6429     }
6430
6431   /* 2.3 Create the reduction code, using one of the three schemes described
6432          above. In SLP we simply need to extract all the elements from the
6433          vector (without reducing them), so we use scalar shifts.  */
6434   else if (reduc_fn != IFN_LAST && !slp_reduc)
6435     {
6436       tree tmp;
6437       tree vec_elem_type;
6438
6439       /* Case 1:  Create:
6440          v_out2 = reduc_expr <v_out1>  */
6441
6442       if (dump_enabled_p ())
6443         dump_printf_loc (MSG_NOTE, vect_location,
6444                          "Reduce using direct vector reduction.\n");
6445
6446       gimple_seq stmts = NULL;
6447       vec_elem_type = TREE_TYPE (vectype);
6448       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6449                                vec_elem_type, reduc_inputs[0]);
6450       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6451       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6452
6453       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6454           && induc_val)
6455         {
6456           /* Earlier we set the initial value to be a vector if induc_val
6457              values.  Check the result and if it is induc_val then replace
6458              with the original initial value, unless induc_val is
6459              the same as initial_def already.  */
6460           tree zcompare = make_ssa_name (boolean_type_node);
6461           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6462                                              new_temp, induc_val);
6463           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6464           tree initial_def = reduc_info->reduc_initial_values[0];
6465           tmp = make_ssa_name (new_scalar_dest);
6466           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6467                                              initial_def, new_temp);
6468           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6469           new_temp = tmp;
6470         }
6471
6472       scalar_results.safe_push (new_temp);
6473     }
6474   else if (direct_slp_reduc)
6475     {
6476       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6477          with the elements for other SLP statements replaced with the
6478          neutral value.  We can then do a normal reduction on each vector.  */
6479
6480       /* Enforced by vectorizable_reduction.  */
6481       gcc_assert (reduc_inputs.length () == 1);
6482       gcc_assert (pow2p_hwi (group_size));
6483
6484       gimple_seq seq = NULL;
6485
6486       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6487          and the same element size as VECTYPE.  */
6488       tree index = build_index_vector (vectype, 0, 1);
6489       tree index_type = TREE_TYPE (index);
6490       tree index_elt_type = TREE_TYPE (index_type);
6491       tree mask_type = truth_type_for (index_type);
6492
6493       /* Create a vector that, for each element, identifies which of
6494          the REDUC_GROUP_SIZE results should use it.  */
6495       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6496       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6497                             build_vector_from_val (index_type, index_mask));
6498
6499       /* Get a neutral vector value.  This is simply a splat of the neutral
6500          scalar value if we have one, otherwise the initial scalar value
6501          is itself a neutral value.  */
6502       tree vector_identity = NULL_TREE;
6503       tree neutral_op = NULL_TREE;
6504       if (slp_node)
6505         {
6506           tree initial_value = NULL_TREE;
6507           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6508             initial_value = reduc_info->reduc_initial_values[0];
6509           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6510                                                  initial_value, false);
6511         }
6512       if (neutral_op)
6513         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6514                                                         neutral_op);
6515       for (unsigned int i = 0; i < group_size; ++i)
6516         {
6517           /* If there's no univeral neutral value, we can use the
6518              initial scalar value from the original PHI.  This is used
6519              for MIN and MAX reduction, for example.  */
6520           if (!neutral_op)
6521             {
6522               tree scalar_value = reduc_info->reduc_initial_values[i];
6523               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6524                                              scalar_value);
6525               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6526                                                               scalar_value);
6527             }
6528
6529           /* Calculate the equivalent of:
6530
6531              sel[j] = (index[j] == i);
6532
6533              which selects the elements of REDUC_INPUTS[0] that should
6534              be included in the result.  */
6535           tree compare_val = build_int_cst (index_elt_type, i);
6536           compare_val = build_vector_from_val (index_type, compare_val);
6537           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6538                                    index, compare_val);
6539
6540           /* Calculate the equivalent of:
6541
6542              vec = seq ? reduc_inputs[0] : vector_identity;
6543
6544              VEC is now suitable for a full vector reduction.  */
6545           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6546                                    sel, reduc_inputs[0], vector_identity);
6547
6548           /* Do the reduction and convert it to the appropriate type.  */
6549           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6550                                       TREE_TYPE (vectype), vec);
6551           scalar = gimple_convert (&seq, scalar_type, scalar);
6552           scalar_results.safe_push (scalar);
6553         }
6554       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6555     }
6556   else
6557     {
6558       bool reduce_with_shift;
6559       tree vec_temp;
6560
6561       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6562
6563       /* See if the target wants to do the final (shift) reduction
6564          in a vector mode of smaller size and first reduce upper/lower
6565          halves against each other.  */
6566       enum machine_mode mode1 = mode;
6567       tree stype = TREE_TYPE (vectype);
6568       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6569       unsigned nunits1 = nunits;
6570       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6571           && reduc_inputs.length () == 1)
6572         {
6573           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6574           /* For SLP reductions we have to make sure lanes match up, but
6575              since we're doing individual element final reduction reducing
6576              vector width here is even more important.
6577              ???  We can also separate lanes with permutes, for the common
6578              case of power-of-two group-size odd/even extracts would work.  */
6579           if (slp_reduc && nunits != nunits1)
6580             {
6581               nunits1 = least_common_multiple (nunits1, group_size);
6582               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6583             }
6584         }
6585       if (!slp_reduc
6586           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6587         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6588
6589       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6590                                                            stype, nunits1);
6591       reduce_with_shift = have_whole_vector_shift (mode1);
6592       if (!VECTOR_MODE_P (mode1)
6593           || !directly_supported_p (code, vectype1))
6594         reduce_with_shift = false;
6595
6596       /* First reduce the vector to the desired vector size we should
6597          do shift reduction on by combining upper and lower halves.  */
6598       gimple_seq stmts = NULL;
6599       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6600                                              code, &stmts);
6601       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6602       reduc_inputs[0] = new_temp;
6603
6604       if (reduce_with_shift && !slp_reduc)
6605         {
6606           int element_bitsize = tree_to_uhwi (bitsize);
6607           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6608              for variable-length vectors and also requires direct target support
6609              for loop reductions.  */
6610           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6611           int nelements = vec_size_in_bits / element_bitsize;
6612           vec_perm_builder sel;
6613           vec_perm_indices indices;
6614
6615           int elt_offset;
6616
6617           tree zero_vec = build_zero_cst (vectype1);
6618           /* Case 2: Create:
6619              for (offset = nelements/2; offset >= 1; offset/=2)
6620                 {
6621                   Create:  va' = vec_shift <va, offset>
6622                   Create:  va = vop <va, va'>
6623                 }  */
6624
6625           tree rhs;
6626
6627           if (dump_enabled_p ())
6628             dump_printf_loc (MSG_NOTE, vect_location,
6629                              "Reduce using vector shifts\n");
6630
6631           gimple_seq stmts = NULL;
6632           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6633           for (elt_offset = nelements / 2;
6634                elt_offset >= 1;
6635                elt_offset /= 2)
6636             {
6637               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6638               indices.new_vector (sel, 2, nelements);
6639               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6640               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6641                                        new_temp, zero_vec, mask);
6642               new_temp = gimple_build (&stmts, code,
6643                                        vectype1, new_name, new_temp);
6644             }
6645           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6646
6647           /* 2.4  Extract the final scalar result.  Create:
6648              s_out3 = extract_field <v_out2, bitpos>  */
6649
6650           if (dump_enabled_p ())
6651             dump_printf_loc (MSG_NOTE, vect_location,
6652                              "extract scalar result\n");
6653
6654           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6655                         bitsize, bitsize_zero_node);
6656           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6657           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6658           gimple_assign_set_lhs (epilog_stmt, new_temp);
6659           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6660           scalar_results.safe_push (new_temp);
6661         }
6662       else
6663         {
6664           /* Case 3: Create:
6665              s = extract_field <v_out2, 0>
6666              for (offset = element_size;
6667                   offset < vector_size;
6668                   offset += element_size;)
6669                {
6670                  Create:  s' = extract_field <v_out2, offset>
6671                  Create:  s = op <s, s'>  // For non SLP cases
6672                }  */
6673
6674           if (dump_enabled_p ())
6675             dump_printf_loc (MSG_NOTE, vect_location,
6676                              "Reduce using scalar code.\n");
6677
6678           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6679           int element_bitsize = tree_to_uhwi (bitsize);
6680           tree compute_type = TREE_TYPE (vectype);
6681           gimple_seq stmts = NULL;
6682           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6683             {
6684               int bit_offset;
6685               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6686                                        vec_temp, bitsize, bitsize_zero_node);
6687
6688               /* In SLP we don't need to apply reduction operation, so we just
6689                  collect s' values in SCALAR_RESULTS.  */
6690               if (slp_reduc)
6691                 scalar_results.safe_push (new_temp);
6692
6693               for (bit_offset = element_bitsize;
6694                    bit_offset < vec_size_in_bits;
6695                    bit_offset += element_bitsize)
6696                 {
6697                   tree bitpos = bitsize_int (bit_offset);
6698                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6699                                            compute_type, vec_temp,
6700                                            bitsize, bitpos);
6701                   if (slp_reduc)
6702                     {
6703                       /* In SLP we don't need to apply reduction operation, so
6704                          we just collect s' values in SCALAR_RESULTS.  */
6705                       new_temp = new_name;
6706                       scalar_results.safe_push (new_name);
6707                     }
6708                   else
6709                     new_temp = gimple_build (&stmts, code, compute_type,
6710                                              new_name, new_temp);
6711                 }
6712             }
6713
6714           /* The only case where we need to reduce scalar results in SLP, is
6715              unrolling.  If the size of SCALAR_RESULTS is greater than
6716              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6717              REDUC_GROUP_SIZE.  */
6718           if (slp_reduc)
6719             {
6720               tree res, first_res, new_res;
6721
6722               /* Reduce multiple scalar results in case of SLP unrolling.  */
6723               for (j = group_size; scalar_results.iterate (j, &res);
6724                    j++)
6725                 {
6726                   first_res = scalar_results[j % group_size];
6727                   new_res = gimple_build (&stmts, code, compute_type,
6728                                           first_res, res);
6729                   scalar_results[j % group_size] = new_res;
6730                 }
6731               scalar_results.truncate (group_size);
6732               for (k = 0; k < group_size; k++)
6733                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6734                                                     scalar_results[k]);
6735             }
6736           else
6737             {
6738               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6739               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6740               scalar_results.safe_push (new_temp);
6741             }
6742
6743           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6744         }
6745
6746       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6747           && induc_val)
6748         {
6749           /* Earlier we set the initial value to be a vector if induc_val
6750              values.  Check the result and if it is induc_val then replace
6751              with the original initial value, unless induc_val is
6752              the same as initial_def already.  */
6753           tree zcompare = make_ssa_name (boolean_type_node);
6754           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6755                                              induc_val);
6756           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6757           tree initial_def = reduc_info->reduc_initial_values[0];
6758           tree tmp = make_ssa_name (new_scalar_dest);
6759           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6760                                              initial_def, new_temp);
6761           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6762           scalar_results[0] = tmp;
6763         }
6764     }
6765
6766   /* 2.5 Adjust the final result by the initial value of the reduction
6767          variable. (When such adjustment is not needed, then
6768          'adjustment_def' is zero).  For example, if code is PLUS we create:
6769          new_temp = loop_exit_def + adjustment_def  */
6770
6771   if (adjustment_def)
6772     {
6773       gcc_assert (!slp_reduc);
6774       gimple_seq stmts = NULL;
6775       if (double_reduc)
6776         {
6777           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6778           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6779           new_temp = gimple_build (&stmts, code, vectype,
6780                                    reduc_inputs[0], adjustment_def);
6781         }
6782       else
6783         {
6784           new_temp = scalar_results[0];
6785           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6786           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6787                                            adjustment_def);
6788           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6789           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6790                                    new_temp, adjustment_def);
6791           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6792         }
6793
6794       epilog_stmt = gimple_seq_last_stmt (stmts);
6795       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6796       scalar_results[0] = new_temp;
6797     }
6798
6799   /* Record this operation if it could be reused by the epilogue loop.  */
6800   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6801       && reduc_inputs.length () == 1)
6802     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6803                                            { orig_reduc_input, reduc_info });
6804
6805   if (double_reduc)
6806     loop = outer_loop;
6807
6808   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6809           phis with new adjusted scalar results, i.e., replace use <s_out0>
6810           with use <s_out4>.
6811
6812      Transform:
6813         loop_exit:
6814           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6815           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6816           v_out2 = reduce <v_out1>
6817           s_out3 = extract_field <v_out2, 0>
6818           s_out4 = adjust_result <s_out3>
6819           use <s_out0>
6820           use <s_out0>
6821
6822      into:
6823
6824         loop_exit:
6825           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6826           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6827           v_out2 = reduce <v_out1>
6828           s_out3 = extract_field <v_out2, 0>
6829           s_out4 = adjust_result <s_out3>
6830           use <s_out4>
6831           use <s_out4> */
6832
6833   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6834   for (k = 0; k < live_out_stmts.size (); k++)
6835     {
6836       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6837       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6838
6839       phis.create (3);
6840       /* Find the loop-closed-use at the loop exit of the original scalar
6841          result.  (The reduction result is expected to have two immediate uses,
6842          one at the latch block, and one at the loop exit).  For double
6843          reductions we are looking for exit phis of the outer loop.  */
6844       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6845         {
6846           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6847             {
6848               if (!is_gimple_debug (USE_STMT (use_p)))
6849                 phis.safe_push (USE_STMT (use_p));
6850             }
6851           else
6852             {
6853               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6854                 {
6855                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6856
6857                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6858                     {
6859                       if (!flow_bb_inside_loop_p (loop,
6860                                              gimple_bb (USE_STMT (phi_use_p)))
6861                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6862                         phis.safe_push (USE_STMT (phi_use_p));
6863                     }
6864                 }
6865             }
6866         }
6867
6868       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6869         {
6870           /* Replace the uses:  */
6871           orig_name = PHI_RESULT (exit_phi);
6872
6873           /* Look for a single use at the target of the skip edge.  */
6874           if (unify_with_main_loop_p)
6875             {
6876               use_operand_p use_p;
6877               gimple *user;
6878               if (!single_imm_use (orig_name, &use_p, &user))
6879                 gcc_unreachable ();
6880               orig_name = gimple_get_lhs (user);
6881             }
6882
6883           scalar_result = scalar_results[k];
6884           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6885             {
6886               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6887                 SET_USE (use_p, scalar_result);
6888               update_stmt (use_stmt);
6889             }
6890         }
6891
6892       phis.release ();
6893     }
6894 }
6895
6896 /* Return a vector of type VECTYPE that is equal to the vector select
6897    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6898    before GSI.  */
6899
6900 static tree
6901 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6902                      tree vec, tree identity)
6903 {
6904   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6905   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6906                                           mask, vec, identity);
6907   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6908   return cond;
6909 }
6910
6911 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6912    order, starting with LHS.  Insert the extraction statements before GSI and
6913    associate the new scalar SSA names with variable SCALAR_DEST.
6914    If MASK is nonzero mask the input and then operate on it unconditionally.
6915    Return the SSA name for the result.  */
6916
6917 static tree
6918 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6919                        tree_code code, tree lhs, tree vector_rhs,
6920                        tree mask)
6921 {
6922   tree vectype = TREE_TYPE (vector_rhs);
6923   tree scalar_type = TREE_TYPE (vectype);
6924   tree bitsize = TYPE_SIZE (scalar_type);
6925   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6926   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6927
6928   /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6929      to perform an unconditional element-wise reduction of it.  */
6930   if (mask)
6931     {
6932       tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6933                                                    "masked_vector_rhs");
6934       tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6935                                                   false);
6936       tree vector_identity = build_vector_from_val (vectype, neutral_op);
6937       gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6938                                              mask, vector_rhs, vector_identity);
6939       gsi_insert_before (gsi, select, GSI_SAME_STMT);
6940       vector_rhs = masked_vector_rhs;
6941     }
6942
6943   for (unsigned HOST_WIDE_INT bit_offset = 0;
6944        bit_offset < vec_size_in_bits;
6945        bit_offset += element_bitsize)
6946     {
6947       tree bitpos = bitsize_int (bit_offset);
6948       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6949                          bitsize, bitpos);
6950
6951       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6952       rhs = make_ssa_name (scalar_dest, stmt);
6953       gimple_assign_set_lhs (stmt, rhs);
6954       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6955
6956       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6957       tree new_name = make_ssa_name (scalar_dest, stmt);
6958       gimple_assign_set_lhs (stmt, new_name);
6959       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6960       lhs = new_name;
6961     }
6962   return lhs;
6963 }
6964
6965 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6966    type of the vector input.  */
6967
6968 static internal_fn
6969 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6970 {
6971   internal_fn mask_reduc_fn;
6972   internal_fn mask_len_reduc_fn;
6973
6974   switch (reduc_fn)
6975     {
6976     case IFN_FOLD_LEFT_PLUS:
6977       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6978       mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6979       break;
6980
6981     default:
6982       return IFN_LAST;
6983     }
6984
6985   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6986                                       OPTIMIZE_FOR_SPEED))
6987     return mask_reduc_fn;
6988   if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6989                                       OPTIMIZE_FOR_SPEED))
6990     return mask_len_reduc_fn;
6991   return IFN_LAST;
6992 }
6993
6994 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6995    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6996    statement.  CODE is the operation performed by STMT_INFO and OPS are
6997    its scalar operands.  REDUC_INDEX is the index of the operand in
6998    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6999    implements in-order reduction, or IFN_LAST if we should open-code it.
7000    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
7001    that should be used to control the operation in a fully-masked loop.  */
7002
7003 static bool
7004 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7005                                stmt_vec_info stmt_info,
7006                                gimple_stmt_iterator *gsi,
7007                                gimple **vec_stmt, slp_tree slp_node,
7008                                gimple *reduc_def_stmt,
7009                                code_helper code, internal_fn reduc_fn,
7010                                tree *ops, int num_ops, tree vectype_in,
7011                                int reduc_index, vec_loop_masks *masks,
7012                                vec_loop_lens *lens)
7013 {
7014   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7015   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7016   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7017
7018   int ncopies;
7019   if (slp_node)
7020     ncopies = 1;
7021   else
7022     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7023
7024   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7025   gcc_assert (ncopies == 1);
7026
7027   bool is_cond_op = false;
7028   if (!code.is_tree_code ())
7029     {
7030       code = conditional_internal_fn_code (internal_fn (code));
7031       gcc_assert (code != ERROR_MARK);
7032       is_cond_op = true;
7033     }
7034
7035   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7036
7037   if (slp_node)
7038     {
7039       if (is_cond_op)
7040         {
7041           if (dump_enabled_p ())
7042             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7043                              "fold-left reduction on SLP not supported.\n");
7044           return false;
7045         }
7046
7047       gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7048                             TYPE_VECTOR_SUBPARTS (vectype_in)));
7049     }
7050
7051   /* The operands either come from a binary operation or an IFN_COND operation.
7052      The former is a gimple assign with binary rhs and the latter is a
7053      gimple call with four arguments.  */
7054   gcc_assert (num_ops == 2 || num_ops == 4);
7055   tree op0, opmask;
7056   if (!is_cond_op)
7057     op0 = ops[1 - reduc_index];
7058   else
7059     {
7060       op0 = ops[2];
7061       opmask = ops[0];
7062       gcc_assert (!slp_node);
7063     }
7064
7065   int group_size = 1;
7066   stmt_vec_info scalar_dest_def_info;
7067   auto_vec<tree> vec_oprnds0, vec_opmask;
7068   if (slp_node)
7069     {
7070       auto_vec<vec<tree> > vec_defs (2);
7071       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7072       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7073       vec_defs[0].release ();
7074       vec_defs[1].release ();
7075       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7076       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7077     }
7078   else
7079     {
7080       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7081                                      op0, &vec_oprnds0);
7082       scalar_dest_def_info = stmt_info;
7083
7084       /* For an IFN_COND_OP we also need the vector mask operand.  */
7085       if (is_cond_op)
7086           vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7087                                          opmask, &vec_opmask);
7088     }
7089
7090   gimple *sdef = scalar_dest_def_info->stmt;
7091   tree scalar_dest = gimple_get_lhs (sdef);
7092   tree scalar_type = TREE_TYPE (scalar_dest);
7093   tree reduc_var = gimple_phi_result (reduc_def_stmt);
7094
7095   int vec_num = vec_oprnds0.length ();
7096   gcc_assert (vec_num == 1 || slp_node);
7097   tree vec_elem_type = TREE_TYPE (vectype_out);
7098   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7099
7100   tree vector_identity = NULL_TREE;
7101   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7102     {
7103       vector_identity = build_zero_cst (vectype_out);
7104       if (!HONOR_SIGNED_ZEROS (vectype_out))
7105         ;
7106       else
7107         {
7108           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7109           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7110                                         vector_identity);
7111         }
7112     }
7113
7114   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7115   int i;
7116   tree def0;
7117   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7118     {
7119       gimple *new_stmt;
7120       tree mask = NULL_TREE;
7121       tree len = NULL_TREE;
7122       tree bias = NULL_TREE;
7123       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7124         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7125       else if (is_cond_op)
7126         mask = vec_opmask[0];
7127       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7128         {
7129           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7130                                    i, 1);
7131           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7132           bias = build_int_cst (intQI_type_node, biasval);
7133           if (!is_cond_op)
7134             mask = build_minus_one_cst (truth_type_for (vectype_in));
7135         }
7136
7137       /* Handle MINUS by adding the negative.  */
7138       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7139         {
7140           tree negated = make_ssa_name (vectype_out);
7141           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7142           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7143           def0 = negated;
7144         }
7145
7146       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7147           && mask && mask_reduc_fn == IFN_LAST)
7148         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7149                                     vector_identity);
7150
7151       /* On the first iteration the input is simply the scalar phi
7152          result, and for subsequent iterations it is the output of
7153          the preceding operation.  */
7154       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7155         {
7156           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7157             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7158                                                    def0, mask, len, bias);
7159           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7160             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7161                                                    def0, mask);
7162           else
7163             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7164                                                    def0);
7165           /* For chained SLP reductions the output of the previous reduction
7166              operation serves as the input of the next. For the final statement
7167              the output cannot be a temporary - we reuse the original
7168              scalar destination of the last statement.  */
7169           if (i != vec_num - 1)
7170             {
7171               gimple_set_lhs (new_stmt, scalar_dest_var);
7172               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7173               gimple_set_lhs (new_stmt, reduc_var);
7174             }
7175         }
7176       else
7177         {
7178           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7179                                              tree_code (code), reduc_var, def0,
7180                                              mask);
7181           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7182           /* Remove the statement, so that we can use the same code paths
7183              as for statements that we've just created.  */
7184           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7185           gsi_remove (&tmp_gsi, true);
7186         }
7187
7188       if (i == vec_num - 1)
7189         {
7190           gimple_set_lhs (new_stmt, scalar_dest);
7191           vect_finish_replace_stmt (loop_vinfo,
7192                                     scalar_dest_def_info,
7193                                     new_stmt);
7194         }
7195       else
7196         vect_finish_stmt_generation (loop_vinfo,
7197                                      scalar_dest_def_info,
7198                                      new_stmt, gsi);
7199
7200       if (slp_node)
7201         slp_node->push_vec_def (new_stmt);
7202       else
7203         {
7204           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7205           *vec_stmt = new_stmt;
7206         }
7207     }
7208
7209   return true;
7210 }
7211
7212 /* Function is_nonwrapping_integer_induction.
7213
7214    Check if STMT_VINO (which is part of loop LOOP) both increments and
7215    does not cause overflow.  */
7216
7217 static bool
7218 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7219 {
7220   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7221   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7222   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7223   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7224   widest_int ni, max_loop_value, lhs_max;
7225   wi::overflow_type overflow = wi::OVF_NONE;
7226
7227   /* Make sure the loop is integer based.  */
7228   if (TREE_CODE (base) != INTEGER_CST
7229       || TREE_CODE (step) != INTEGER_CST)
7230     return false;
7231
7232   /* Check that the max size of the loop will not wrap.  */
7233
7234   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7235     return true;
7236
7237   if (! max_stmt_executions (loop, &ni))
7238     return false;
7239
7240   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7241                             &overflow);
7242   if (overflow)
7243     return false;
7244
7245   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7246                             TYPE_SIGN (lhs_type), &overflow);
7247   if (overflow)
7248     return false;
7249
7250   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7251           <= TYPE_PRECISION (lhs_type));
7252 }
7253
7254 /* Check if masking can be supported by inserting a conditional expression.
7255    CODE is the code for the operation.  COND_FN is the conditional internal
7256    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
7257 static bool
7258 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7259                          tree vectype_in)
7260 {
7261   if (cond_fn != IFN_LAST
7262       && direct_internal_fn_supported_p (cond_fn, vectype_in,
7263                                          OPTIMIZE_FOR_SPEED))
7264     return false;
7265
7266   if (code.is_tree_code ())
7267     switch (tree_code (code))
7268       {
7269       case DOT_PROD_EXPR:
7270       case SAD_EXPR:
7271         return true;
7272
7273       default:
7274         break;
7275       }
7276   return false;
7277 }
7278
7279 /* Insert a conditional expression to enable masked vectorization.  CODE is the
7280    code for the operation.  VOP is the array of operands.  MASK is the loop
7281    mask.  GSI is a statement iterator used to place the new conditional
7282    expression.  */
7283 static void
7284 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7285                       gimple_stmt_iterator *gsi)
7286 {
7287   switch (tree_code (code))
7288     {
7289     case DOT_PROD_EXPR:
7290       {
7291         tree vectype = TREE_TYPE (vop[1]);
7292         tree zero = build_zero_cst (vectype);
7293         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7294         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7295                                                mask, vop[1], zero);
7296         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7297         vop[1] = masked_op1;
7298         break;
7299       }
7300
7301     case SAD_EXPR:
7302       {
7303         tree vectype = TREE_TYPE (vop[1]);
7304         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7305         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7306                                                mask, vop[1], vop[0]);
7307         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7308         vop[1] = masked_op1;
7309         break;
7310       }
7311
7312     default:
7313       gcc_unreachable ();
7314     }
7315 }
7316
7317 /* Function vectorizable_reduction.
7318
7319    Check if STMT_INFO performs a reduction operation that can be vectorized.
7320    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7321    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7322    Return true if STMT_INFO is vectorizable in this way.
7323
7324    This function also handles reduction idioms (patterns) that have been
7325    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7326    may be of this form:
7327      X = pattern_expr (arg0, arg1, ..., X)
7328    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7329    sequence that had been detected and replaced by the pattern-stmt
7330    (STMT_INFO).
7331
7332    This function also handles reduction of condition expressions, for example:
7333      for (int i = 0; i < N; i++)
7334        if (a[i] < value)
7335          last = a[i];
7336    This is handled by vectorising the loop and creating an additional vector
7337    containing the loop indexes for which "a[i] < value" was true.  In the
7338    function epilogue this is reduced to a single max value and then used to
7339    index into the vector of results.
7340
7341    In some cases of reduction patterns, the type of the reduction variable X is
7342    different than the type of the other arguments of STMT_INFO.
7343    In such cases, the vectype that is used when transforming STMT_INFO into
7344    a vector stmt is different than the vectype that is used to determine the
7345    vectorization factor, because it consists of a different number of elements
7346    than the actual number of elements that are being operated upon in parallel.
7347
7348    For example, consider an accumulation of shorts into an int accumulator.
7349    On some targets it's possible to vectorize this pattern operating on 8
7350    shorts at a time (hence, the vectype for purposes of determining the
7351    vectorization factor should be V8HI); on the other hand, the vectype that
7352    is used to create the vector form is actually V4SI (the type of the result).
7353
7354    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7355    indicates what is the actual level of parallelism (V8HI in the example), so
7356    that the right vectorization factor would be derived.  This vectype
7357    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7358    be used to create the vectorized stmt.  The right vectype for the vectorized
7359    stmt is obtained from the type of the result X:
7360       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7361
7362    This means that, contrary to "regular" reductions (or "regular" stmts in
7363    general), the following equation:
7364       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7365    does *NOT* necessarily hold for reduction patterns.  */
7366
7367 bool
7368 vectorizable_reduction (loop_vec_info loop_vinfo,
7369                         stmt_vec_info stmt_info, slp_tree slp_node,
7370                         slp_instance slp_node_instance,
7371                         stmt_vector_for_cost *cost_vec)
7372 {
7373   tree vectype_in = NULL_TREE;
7374   tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7375   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7376   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7377   stmt_vec_info cond_stmt_vinfo = NULL;
7378   int i;
7379   int ncopies;
7380   bool single_defuse_cycle = false;
7381   bool nested_cycle = false;
7382   bool double_reduc = false;
7383   int vec_num;
7384   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7385   tree cond_reduc_val = NULL_TREE;
7386
7387   /* Make sure it was already recognized as a reduction computation.  */
7388   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7389       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7390       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7391     return false;
7392
7393   /* The stmt we store reduction analysis meta on.  */
7394   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7395   reduc_info->is_reduc_info = true;
7396
7397   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7398     {
7399       if (is_a <gphi *> (stmt_info->stmt))
7400         {
7401           if (slp_node)
7402             {
7403               /* We eventually need to set a vector type on invariant
7404                  arguments.  */
7405               unsigned j;
7406               slp_tree child;
7407               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7408                 if (!vect_maybe_update_slp_op_vectype
7409                        (child, SLP_TREE_VECTYPE (slp_node)))
7410                   {
7411                     if (dump_enabled_p ())
7412                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7413                                        "incompatible vector types for "
7414                                        "invariants\n");
7415                     return false;
7416                   }
7417             }
7418           /* Analysis for double-reduction is done on the outer
7419              loop PHI, nested cycles have no further restrictions.  */
7420           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7421         }
7422       else
7423         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7424       return true;
7425     }
7426
7427   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7428   stmt_vec_info phi_info = stmt_info;
7429   if (!is_a <gphi *> (stmt_info->stmt))
7430     {
7431       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7432       return true;
7433     }
7434   if (slp_node)
7435     {
7436       slp_node_instance->reduc_phis = slp_node;
7437       /* ???  We're leaving slp_node to point to the PHIs, we only
7438          need it to get at the number of vector stmts which wasn't
7439          yet initialized for the instance root.  */
7440     }
7441   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7442     {
7443       use_operand_p use_p;
7444       gimple *use_stmt;
7445       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7446                                  &use_p, &use_stmt);
7447       gcc_assert (res);
7448       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7449     }
7450
7451   /* PHIs should not participate in patterns.  */
7452   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7453   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7454
7455   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7456      and compute the reduction chain length.  Discover the real
7457      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7458   tree reduc_def
7459     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7460                              loop_latch_edge
7461                                (gimple_bb (reduc_def_phi)->loop_father));
7462   unsigned reduc_chain_length = 0;
7463   bool only_slp_reduc_chain = true;
7464   stmt_info = NULL;
7465   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7466   while (reduc_def != PHI_RESULT (reduc_def_phi))
7467     {
7468       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7469       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7470       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7471         {
7472           if (dump_enabled_p ())
7473             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7474                              "reduction chain broken by patterns.\n");
7475           return false;
7476         }
7477       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7478         only_slp_reduc_chain = false;
7479       /* For epilogue generation live members of the chain need
7480          to point back to the PHI via their original stmt for
7481          info_for_reduction to work.  For SLP we need to look at
7482          all lanes here - even though we only will vectorize from
7483          the SLP node with live lane zero the other live lanes also
7484          need to be identified as part of a reduction to be able
7485          to skip code generation for them.  */
7486       if (slp_for_stmt_info)
7487         {
7488           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7489             if (STMT_VINFO_LIVE_P (s))
7490               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7491         }
7492       else if (STMT_VINFO_LIVE_P (vdef))
7493         STMT_VINFO_REDUC_DEF (def) = phi_info;
7494       gimple_match_op op;
7495       if (!gimple_extract_op (vdef->stmt, &op))
7496         {
7497           if (dump_enabled_p ())
7498             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7499                              "reduction chain includes unsupported"
7500                              " statement type.\n");
7501           return false;
7502         }
7503       if (CONVERT_EXPR_CODE_P (op.code))
7504         {
7505           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7506             {
7507               if (dump_enabled_p ())
7508                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7509                                  "conversion in the reduction chain.\n");
7510               return false;
7511             }
7512         }
7513       else if (!stmt_info)
7514         /* First non-conversion stmt.  */
7515         stmt_info = vdef;
7516       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7517       reduc_chain_length++;
7518       if (!stmt_info && slp_node)
7519         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7520     }
7521   /* PHIs should not participate in patterns.  */
7522   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7523
7524   if (nested_in_vect_loop_p (loop, stmt_info))
7525     {
7526       loop = loop->inner;
7527       nested_cycle = true;
7528     }
7529
7530   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7531      element.  */
7532   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7533     {
7534       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7535       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7536     }
7537   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7538     gcc_assert (slp_node
7539                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7540
7541   /* 1. Is vectorizable reduction?  */
7542   /* Not supportable if the reduction variable is used in the loop, unless
7543      it's a reduction chain.  */
7544   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7545       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7546     return false;
7547
7548   /* Reductions that are not used even in an enclosing outer-loop,
7549      are expected to be "live" (used out of the loop).  */
7550   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7551       && !STMT_VINFO_LIVE_P (stmt_info))
7552     return false;
7553
7554   /* 2. Has this been recognized as a reduction pattern?
7555
7556      Check if STMT represents a pattern that has been recognized
7557      in earlier analysis stages.  For stmts that represent a pattern,
7558      the STMT_VINFO_RELATED_STMT field records the last stmt in
7559      the original sequence that constitutes the pattern.  */
7560
7561   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7562   if (orig_stmt_info)
7563     {
7564       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7565       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7566     }
7567
7568   /* 3. Check the operands of the operation.  The first operands are defined
7569         inside the loop body. The last operand is the reduction variable,
7570         which is defined by the loop-header-phi.  */
7571
7572   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7573   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7574   gimple_match_op op;
7575   if (!gimple_extract_op (stmt_info->stmt, &op))
7576     gcc_unreachable ();
7577   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7578                             || op.code == WIDEN_SUM_EXPR
7579                             || op.code == SAD_EXPR);
7580
7581   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7582       && !SCALAR_FLOAT_TYPE_P (op.type))
7583     return false;
7584
7585   /* Do not try to vectorize bit-precision reductions.  */
7586   if (!type_has_mode_precision_p (op.type))
7587     return false;
7588
7589   /* For lane-reducing ops we're reducing the number of reduction PHIs
7590      which means the only use of that may be in the lane-reducing operation.  */
7591   if (lane_reduc_code_p
7592       && reduc_chain_length != 1
7593       && !only_slp_reduc_chain)
7594     {
7595       if (dump_enabled_p ())
7596         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7597                          "lane-reducing reduction with extra stmts.\n");
7598       return false;
7599     }
7600
7601   /* All uses but the last are expected to be defined in the loop.
7602      The last use is the reduction variable.  In case of nested cycle this
7603      assumption is not true: we use reduc_index to record the index of the
7604      reduction variable.  */
7605   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7606   /* We need to skip an extra operand for COND_EXPRs with embedded
7607      comparison.  */
7608   unsigned opno_adjust = 0;
7609   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7610     opno_adjust = 1;
7611   for (i = 0; i < (int) op.num_ops; i++)
7612     {
7613       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7614       if (i == 0 && op.code == COND_EXPR)
7615         continue;
7616
7617       stmt_vec_info def_stmt_info;
7618       enum vect_def_type dt;
7619       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7620                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7621                                &vectype_op[i], &def_stmt_info))
7622         {
7623           if (dump_enabled_p ())
7624             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7625                              "use not simple.\n");
7626           return false;
7627         }
7628       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7629         continue;
7630
7631       /* For an IFN_COND_OP we might hit the reduction definition operand
7632          twice (once as definition, once as else).  */
7633       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7634         continue;
7635
7636       /* There should be only one cycle def in the stmt, the one
7637          leading to reduc_def.  */
7638       if (VECTORIZABLE_CYCLE_DEF (dt))
7639         return false;
7640
7641       if (!vectype_op[i])
7642         vectype_op[i]
7643           = get_vectype_for_scalar_type (loop_vinfo,
7644                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7645
7646       /* To properly compute ncopies we are interested in the widest
7647          non-reduction input type in case we're looking at a widening
7648          accumulation that we later handle in vect_transform_reduction.  */
7649       if (lane_reduc_code_p
7650           && vectype_op[i]
7651           && (!vectype_in
7652               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7653                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7654         vectype_in = vectype_op[i];
7655
7656       if (op.code == COND_EXPR)
7657         {
7658           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
7659           if (dt == vect_constant_def)
7660             {
7661               cond_reduc_dt = dt;
7662               cond_reduc_val = op.ops[i];
7663             }
7664           if (dt == vect_induction_def
7665               && def_stmt_info
7666               && is_nonwrapping_integer_induction (def_stmt_info, loop))
7667             {
7668               cond_reduc_dt = dt;
7669               cond_stmt_vinfo = def_stmt_info;
7670             }
7671         }
7672     }
7673   if (!vectype_in)
7674     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7675   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7676
7677   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7678   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7679   /* If we have a condition reduction, see if we can simplify it further.  */
7680   if (v_reduc_type == COND_REDUCTION)
7681     {
7682       if (slp_node)
7683         return false;
7684
7685       /* When the condition uses the reduction value in the condition, fail.  */
7686       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7687         {
7688           if (dump_enabled_p ())
7689             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7690                              "condition depends on previous iteration\n");
7691           return false;
7692         }
7693
7694       if (reduc_chain_length == 1
7695           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7696                                               OPTIMIZE_FOR_SPEED)
7697               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7698                                                  vectype_in,
7699                                                  OPTIMIZE_FOR_SPEED)))
7700         {
7701           if (dump_enabled_p ())
7702             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7703                              "optimizing condition reduction with"
7704                              " FOLD_EXTRACT_LAST.\n");
7705           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7706         }
7707       else if (cond_reduc_dt == vect_induction_def)
7708         {
7709           tree base
7710             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7711           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7712
7713           gcc_assert (TREE_CODE (base) == INTEGER_CST
7714                       && TREE_CODE (step) == INTEGER_CST);
7715           cond_reduc_val = NULL_TREE;
7716           enum tree_code cond_reduc_op_code = ERROR_MARK;
7717           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7718           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7719             ;
7720           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7721              above base; punt if base is the minimum value of the type for
7722              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7723           else if (tree_int_cst_sgn (step) == -1)
7724             {
7725               cond_reduc_op_code = MIN_EXPR;
7726               if (tree_int_cst_sgn (base) == -1)
7727                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7728               else if (tree_int_cst_lt (base,
7729                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7730                 cond_reduc_val
7731                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7732             }
7733           else
7734             {
7735               cond_reduc_op_code = MAX_EXPR;
7736               if (tree_int_cst_sgn (base) == 1)
7737                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7738               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7739                                         base))
7740                 cond_reduc_val
7741                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7742             }
7743           if (cond_reduc_val)
7744             {
7745               if (dump_enabled_p ())
7746                 dump_printf_loc (MSG_NOTE, vect_location,
7747                                  "condition expression based on "
7748                                  "integer induction.\n");
7749               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7750               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7751                 = cond_reduc_val;
7752               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7753             }
7754         }
7755       else if (cond_reduc_dt == vect_constant_def)
7756         {
7757           enum vect_def_type cond_initial_dt;
7758           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7759           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7760           if (cond_initial_dt == vect_constant_def
7761               && types_compatible_p (TREE_TYPE (cond_initial_val),
7762                                      TREE_TYPE (cond_reduc_val)))
7763             {
7764               tree e = fold_binary (LE_EXPR, boolean_type_node,
7765                                     cond_initial_val, cond_reduc_val);
7766               if (e && (integer_onep (e) || integer_zerop (e)))
7767                 {
7768                   if (dump_enabled_p ())
7769                     dump_printf_loc (MSG_NOTE, vect_location,
7770                                      "condition expression based on "
7771                                      "compile time constant.\n");
7772                   /* Record reduction code at analysis stage.  */
7773                   STMT_VINFO_REDUC_CODE (reduc_info)
7774                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7775                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7776                 }
7777             }
7778         }
7779     }
7780
7781   if (STMT_VINFO_LIVE_P (phi_info))
7782     return false;
7783
7784   if (slp_node)
7785     ncopies = 1;
7786   else
7787     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7788
7789   gcc_assert (ncopies >= 1);
7790
7791   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7792
7793   if (nested_cycle)
7794     {
7795       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7796                   == vect_double_reduction_def);
7797       double_reduc = true;
7798     }
7799
7800   /* 4.2. Check support for the epilog operation.
7801
7802           If STMT represents a reduction pattern, then the type of the
7803           reduction variable may be different than the type of the rest
7804           of the arguments.  For example, consider the case of accumulation
7805           of shorts into an int accumulator; The original code:
7806                         S1: int_a = (int) short_a;
7807           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7808
7809           was replaced with:
7810                         STMT: int_acc = widen_sum <short_a, int_acc>
7811
7812           This means that:
7813           1. The tree-code that is used to create the vector operation in the
7814              epilog code (that reduces the partial results) is not the
7815              tree-code of STMT, but is rather the tree-code of the original
7816              stmt from the pattern that STMT is replacing.  I.e, in the example
7817              above we want to use 'widen_sum' in the loop, but 'plus' in the
7818              epilog.
7819           2. The type (mode) we use to check available target support
7820              for the vector operation to be created in the *epilog*, is
7821              determined by the type of the reduction variable (in the example
7822              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7823              However the type (mode) we use to check available target support
7824              for the vector operation to be created *inside the loop*, is
7825              determined by the type of the other arguments to STMT (in the
7826              example we'd check this: optab_handler (widen_sum_optab,
7827              vect_short_mode)).
7828
7829           This is contrary to "regular" reductions, in which the types of all
7830           the arguments are the same as the type of the reduction variable.
7831           For "regular" reductions we can therefore use the same vector type
7832           (and also the same tree-code) when generating the epilog code and
7833           when generating the code inside the loop.  */
7834
7835   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7836
7837   /* If conversion might have created a conditional operation like
7838      IFN_COND_ADD already.  Use the internal code for the following checks.  */
7839   if (orig_code.is_internal_fn ())
7840     {
7841       tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7842       orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7843     }
7844
7845   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7846
7847   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7848   if (reduction_type == TREE_CODE_REDUCTION)
7849     {
7850       /* Check whether it's ok to change the order of the computation.
7851          Generally, when vectorizing a reduction we change the order of the
7852          computation.  This may change the behavior of the program in some
7853          cases, so we need to check that this is ok.  One exception is when
7854          vectorizing an outer-loop: the inner-loop is executed sequentially,
7855          and therefore vectorizing reductions in the inner-loop during
7856          outer-loop vectorization is safe.  Likewise when we are vectorizing
7857          a series of reductions using SLP and the VF is one the reductions
7858          are performed in scalar order.  */
7859       if (slp_node
7860           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7861           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7862         ;
7863       else if (needs_fold_left_reduction_p (op.type, orig_code))
7864         {
7865           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7866              is not directy used in stmt.  */
7867           if (!only_slp_reduc_chain
7868               && reduc_chain_length != 1)
7869             {
7870               if (dump_enabled_p ())
7871                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7872                                  "in-order reduction chain without SLP.\n");
7873               return false;
7874             }
7875           STMT_VINFO_REDUC_TYPE (reduc_info)
7876             = reduction_type = FOLD_LEFT_REDUCTION;
7877         }
7878       else if (!commutative_binary_op_p (orig_code, op.type)
7879                || !associative_binary_op_p (orig_code, op.type))
7880         {
7881           if (dump_enabled_p ())
7882             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7883                             "reduction: not commutative/associative\n");
7884           return false;
7885         }
7886     }
7887
7888   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7889       && ncopies > 1)
7890     {
7891       if (dump_enabled_p ())
7892         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7893                          "multiple types in double reduction or condition "
7894                          "reduction or fold-left reduction.\n");
7895       return false;
7896     }
7897
7898   internal_fn reduc_fn = IFN_LAST;
7899   if (reduction_type == TREE_CODE_REDUCTION
7900       || reduction_type == FOLD_LEFT_REDUCTION
7901       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7902       || reduction_type == CONST_COND_REDUCTION)
7903     {
7904       if (reduction_type == FOLD_LEFT_REDUCTION
7905           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7906           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7907         {
7908           if (reduc_fn != IFN_LAST
7909               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7910                                                   OPTIMIZE_FOR_SPEED))
7911             {
7912               if (dump_enabled_p ())
7913                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7914                                  "reduc op not supported by target.\n");
7915
7916               reduc_fn = IFN_LAST;
7917             }
7918         }
7919       else
7920         {
7921           if (!nested_cycle || double_reduc)
7922             {
7923               if (dump_enabled_p ())
7924                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7925                                  "no reduc code for scalar code.\n");
7926
7927               return false;
7928             }
7929         }
7930     }
7931   else if (reduction_type == COND_REDUCTION)
7932     {
7933       int scalar_precision
7934         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7935       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7936       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7937                                                 vectype_out);
7938
7939       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7940                                           OPTIMIZE_FOR_SPEED))
7941         reduc_fn = IFN_REDUC_MAX;
7942     }
7943   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7944
7945   if (reduction_type != EXTRACT_LAST_REDUCTION
7946       && (!nested_cycle || double_reduc)
7947       && reduc_fn == IFN_LAST
7948       && !nunits_out.is_constant ())
7949     {
7950       if (dump_enabled_p ())
7951         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7952                          "missing target support for reduction on"
7953                          " variable-length vectors.\n");
7954       return false;
7955     }
7956
7957   /* For SLP reductions, see if there is a neutral value we can use.  */
7958   tree neutral_op = NULL_TREE;
7959   if (slp_node)
7960     {
7961       tree initial_value = NULL_TREE;
7962       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7963         initial_value = vect_phi_initial_value (reduc_def_phi);
7964       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7965                                              orig_code, initial_value);
7966     }
7967
7968   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7969     {
7970       /* We can't support in-order reductions of code such as this:
7971
7972            for (int i = 0; i < n1; ++i)
7973              for (int j = 0; j < n2; ++j)
7974                l += a[j];
7975
7976          since GCC effectively transforms the loop when vectorizing:
7977
7978            for (int i = 0; i < n1 / VF; ++i)
7979              for (int j = 0; j < n2; ++j)
7980                for (int k = 0; k < VF; ++k)
7981                  l += a[j];
7982
7983          which is a reassociation of the original operation.  */
7984       if (dump_enabled_p ())
7985         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7986                          "in-order double reduction not supported.\n");
7987
7988       return false;
7989     }
7990
7991   if (reduction_type == FOLD_LEFT_REDUCTION
7992       && slp_node
7993       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7994     {
7995       /* We cannot use in-order reductions in this case because there is
7996          an implicit reassociation of the operations involved.  */
7997       if (dump_enabled_p ())
7998         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7999                          "in-order unchained SLP reductions not supported.\n");
8000       return false;
8001     }
8002
8003   /* For double reductions, and for SLP reductions with a neutral value,
8004      we construct a variable-length initial vector by loading a vector
8005      full of the neutral value and then shift-and-inserting the start
8006      values into the low-numbered elements.  */
8007   if ((double_reduc || neutral_op)
8008       && !nunits_out.is_constant ()
8009       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8010                                           vectype_out, OPTIMIZE_FOR_SPEED))
8011     {
8012       if (dump_enabled_p ())
8013         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8014                          "reduction on variable-length vectors requires"
8015                          " target support for a vector-shift-and-insert"
8016                          " operation.\n");
8017       return false;
8018     }
8019
8020   /* Check extra constraints for variable-length unchained SLP reductions.  */
8021   if (slp_node
8022       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8023       && !nunits_out.is_constant ())
8024     {
8025       /* We checked above that we could build the initial vector when
8026          there's a neutral element value.  Check here for the case in
8027          which each SLP statement has its own initial value and in which
8028          that value needs to be repeated for every instance of the
8029          statement within the initial vector.  */
8030       unsigned int group_size = SLP_TREE_LANES (slp_node);
8031       if (!neutral_op
8032           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8033                                               TREE_TYPE (vectype_out)))
8034         {
8035           if (dump_enabled_p ())
8036             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8037                              "unsupported form of SLP reduction for"
8038                              " variable-length vectors: cannot build"
8039                              " initial vector.\n");
8040           return false;
8041         }
8042       /* The epilogue code relies on the number of elements being a multiple
8043          of the group size.  The duplicate-and-interleave approach to setting
8044          up the initial vector does too.  */
8045       if (!multiple_p (nunits_out, group_size))
8046         {
8047           if (dump_enabled_p ())
8048             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8049                              "unsupported form of SLP reduction for"
8050                              " variable-length vectors: the vector size"
8051                              " is not a multiple of the number of results.\n");
8052           return false;
8053         }
8054     }
8055
8056   if (reduction_type == COND_REDUCTION)
8057     {
8058       widest_int ni;
8059
8060       if (! max_loop_iterations (loop, &ni))
8061         {
8062           if (dump_enabled_p ())
8063             dump_printf_loc (MSG_NOTE, vect_location,
8064                              "loop count not known, cannot create cond "
8065                              "reduction.\n");
8066           return false;
8067         }
8068       /* Convert backedges to iterations.  */
8069       ni += 1;
8070
8071       /* The additional index will be the same type as the condition.  Check
8072          that the loop can fit into this less one (because we'll use up the
8073          zero slot for when there are no matches).  */
8074       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8075       if (wi::geu_p (ni, wi::to_widest (max_index)))
8076         {
8077           if (dump_enabled_p ())
8078             dump_printf_loc (MSG_NOTE, vect_location,
8079                              "loop size is greater than data size.\n");
8080           return false;
8081         }
8082     }
8083
8084   /* In case the vectorization factor (VF) is bigger than the number
8085      of elements that we can fit in a vectype (nunits), we have to generate
8086      more than one vector stmt - i.e - we need to "unroll" the
8087      vector stmt by a factor VF/nunits.  For more details see documentation
8088      in vectorizable_operation.  */
8089
8090   /* If the reduction is used in an outer loop we need to generate
8091      VF intermediate results, like so (e.g. for ncopies=2):
8092         r0 = phi (init, r0)
8093         r1 = phi (init, r1)
8094         r0 = x0 + r0;
8095         r1 = x1 + r1;
8096     (i.e. we generate VF results in 2 registers).
8097     In this case we have a separate def-use cycle for each copy, and therefore
8098     for each copy we get the vector def for the reduction variable from the
8099     respective phi node created for this copy.
8100
8101     Otherwise (the reduction is unused in the loop nest), we can combine
8102     together intermediate results, like so (e.g. for ncopies=2):
8103         r = phi (init, r)
8104         r = x0 + r;
8105         r = x1 + r;
8106    (i.e. we generate VF/2 results in a single register).
8107    In this case for each copy we get the vector def for the reduction variable
8108    from the vectorized reduction operation generated in the previous iteration.
8109
8110    This only works when we see both the reduction PHI and its only consumer
8111    in vectorizable_reduction and there are no intermediate stmts
8112    participating.  When unrolling we want each unrolled iteration to have its
8113    own reduction accumulator since one of the main goals of unrolling a
8114    reduction is to reduce the aggregate loop-carried latency.  */
8115   if (ncopies > 1
8116       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8117       && reduc_chain_length == 1
8118       && loop_vinfo->suggested_unroll_factor == 1)
8119     single_defuse_cycle = true;
8120
8121   if (single_defuse_cycle || lane_reduc_code_p)
8122     {
8123       gcc_assert (op.code != COND_EXPR);
8124
8125       /* 4. Supportable by target?  */
8126       bool ok = true;
8127
8128       /* 4.1. check support for the operation in the loop
8129
8130          This isn't necessary for the lane reduction codes, since they
8131          can only be produced by pattern matching, and it's up to the
8132          pattern matcher to test for support.  The main reason for
8133          specifically skipping this step is to avoid rechecking whether
8134          mixed-sign dot-products can be implemented using signed
8135          dot-products.  */
8136       machine_mode vec_mode = TYPE_MODE (vectype_in);
8137       if (!lane_reduc_code_p
8138           && !directly_supported_p (op.code, vectype_in, optab_vector))
8139         {
8140           if (dump_enabled_p ())
8141             dump_printf (MSG_NOTE, "op not supported by target.\n");
8142           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8143               || !vect_can_vectorize_without_simd_p (op.code))
8144             ok = false;
8145           else
8146             if (dump_enabled_p ())
8147               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8148         }
8149
8150       if (vect_emulated_vector_p (vectype_in)
8151           && !vect_can_vectorize_without_simd_p (op.code))
8152         {
8153           if (dump_enabled_p ())
8154             dump_printf (MSG_NOTE, "using word mode not possible.\n");
8155           return false;
8156         }
8157
8158       /* lane-reducing operations have to go through vect_transform_reduction.
8159          For the other cases try without the single cycle optimization.  */
8160       if (!ok)
8161         {
8162           if (lane_reduc_code_p)
8163             return false;
8164           else
8165             single_defuse_cycle = false;
8166         }
8167     }
8168   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8169
8170   /* If the reduction stmt is one of the patterns that have lane
8171      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
8172   if ((ncopies > 1 && ! single_defuse_cycle)
8173       && lane_reduc_code_p)
8174     {
8175       if (dump_enabled_p ())
8176         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8177                          "multi def-use cycle not possible for lane-reducing "
8178                          "reduction operation\n");
8179       return false;
8180     }
8181
8182   if (slp_node
8183       && !(!single_defuse_cycle
8184            && !lane_reduc_code_p
8185            && reduction_type != FOLD_LEFT_REDUCTION))
8186     for (i = 0; i < (int) op.num_ops; i++)
8187       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8188         {
8189           if (dump_enabled_p ())
8190             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8191                              "incompatible vector types for invariants\n");
8192           return false;
8193         }
8194
8195   if (slp_node)
8196     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8197   else
8198     vec_num = 1;
8199
8200   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8201                              reduction_type, ncopies, cost_vec);
8202   /* Cost the reduction op inside the loop if transformed via
8203      vect_transform_reduction.  Otherwise this is costed by the
8204      separate vectorizable_* routines.  */
8205   if (single_defuse_cycle || lane_reduc_code_p)
8206     {
8207       int factor = 1;
8208       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8209         /* Three dot-products and a subtraction.  */
8210         factor = 4;
8211       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8212                         stmt_info, 0, vect_body);
8213     }
8214
8215   if (dump_enabled_p ()
8216       && reduction_type == FOLD_LEFT_REDUCTION)
8217     dump_printf_loc (MSG_NOTE, vect_location,
8218                      "using an in-order (fold-left) reduction.\n");
8219   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8220   /* All but single defuse-cycle optimized, lane-reducing and fold-left
8221      reductions go through their own vectorizable_* routines.  */
8222   if (!single_defuse_cycle
8223       && !lane_reduc_code_p
8224       && reduction_type != FOLD_LEFT_REDUCTION)
8225     {
8226       stmt_vec_info tem
8227         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8228       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8229         {
8230           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8231           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8232         }
8233       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8234       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8235     }
8236   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8237     {
8238       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8239       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8240       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8241
8242       if (reduction_type != FOLD_LEFT_REDUCTION
8243           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8244           && (cond_fn == IFN_LAST
8245               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8246                                                   OPTIMIZE_FOR_SPEED)))
8247         {
8248           if (dump_enabled_p ())
8249             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8250                              "can't operate on partial vectors because"
8251                              " no conditional operation is available.\n");
8252           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8253         }
8254       else if (reduction_type == FOLD_LEFT_REDUCTION
8255                && reduc_fn == IFN_LAST
8256                && !expand_vec_cond_expr_p (vectype_in,
8257                                            truth_type_for (vectype_in),
8258                                            SSA_NAME))
8259         {
8260           if (dump_enabled_p ())
8261             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8262                              "can't operate on partial vectors because"
8263                              " no conditional operation is available.\n");
8264           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8265         }
8266       else if (reduction_type == FOLD_LEFT_REDUCTION
8267                && internal_fn_mask_index (reduc_fn) == -1
8268                && FLOAT_TYPE_P (vectype_in)
8269                && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8270         {
8271           if (dump_enabled_p ())
8272             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8273                              "can't operate on partial vectors because"
8274                              " signed zeros cannot be preserved.\n");
8275           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8276         }
8277       else
8278         {
8279           internal_fn mask_reduc_fn
8280             = get_masked_reduction_fn (reduc_fn, vectype_in);
8281
8282           if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8283             vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8284                                   vectype_in, 1);
8285           else
8286             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8287                                    vectype_in, NULL);
8288         }
8289     }
8290   return true;
8291 }
8292
8293 /* STMT_INFO is a dot-product reduction whose multiplication operands
8294    have different signs.  Emit a sequence to emulate the operation
8295    using a series of signed DOT_PROD_EXPRs and return the last
8296    statement generated.  VEC_DEST is the result of the vector operation
8297    and VOP lists its inputs.  */
8298
8299 static gassign *
8300 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8301                              gimple_stmt_iterator *gsi, tree vec_dest,
8302                              tree vop[3])
8303 {
8304   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8305   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8306   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8307   gimple *new_stmt;
8308
8309   /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
8310   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8311     std::swap (vop[0], vop[1]);
8312
8313   /* Convert all inputs to signed types.  */
8314   for (int i = 0; i < 3; ++i)
8315     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8316       {
8317         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8318         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8319         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8320         vop[i] = tmp;
8321       }
8322
8323   /* In the comments below we assume 8-bit inputs for simplicity,
8324      but the approach works for any full integer type.  */
8325
8326   /* Create a vector of -128.  */
8327   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8328   tree min_narrow = build_vector_from_val (narrow_vectype,
8329                                            min_narrow_elttype);
8330
8331   /* Create a vector of 64.  */
8332   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8333   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8334   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8335
8336   /* Emit: SUB_RES = VOP[0] - 128.  */
8337   tree sub_res = make_ssa_name (narrow_vectype);
8338   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8339   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8340
8341   /* Emit:
8342
8343        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8344        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8345        STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8346
8347      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8348      Doing the two 64 * y steps first allows more time to compute x.  */
8349   tree stage1 = make_ssa_name (wide_vectype);
8350   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8351                                   vop[1], half_narrow, vop[2]);
8352   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8353
8354   tree stage2 = make_ssa_name (wide_vectype);
8355   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8356                                   vop[1], half_narrow, stage1);
8357   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8358
8359   tree stage3 = make_ssa_name (wide_vectype);
8360   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8361                                   sub_res, vop[1], stage2);
8362   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8363
8364   /* Convert STAGE3 to the reduction type.  */
8365   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8366 }
8367
8368 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8369    value.  */
8370
8371 bool
8372 vect_transform_reduction (loop_vec_info loop_vinfo,
8373                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8374                           gimple **vec_stmt, slp_tree slp_node)
8375 {
8376   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8377   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8378   int i;
8379   int ncopies;
8380   int vec_num;
8381
8382   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8383   gcc_assert (reduc_info->is_reduc_info);
8384
8385   if (nested_in_vect_loop_p (loop, stmt_info))
8386     {
8387       loop = loop->inner;
8388       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8389     }
8390
8391   gimple_match_op op;
8392   if (!gimple_extract_op (stmt_info->stmt, &op))
8393     gcc_unreachable ();
8394
8395   /* All uses but the last are expected to be defined in the loop.
8396      The last use is the reduction variable.  In case of nested cycle this
8397      assumption is not true: we use reduc_index to record the index of the
8398      reduction variable.  */
8399   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8400   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8401   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8402   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8403
8404   if (slp_node)
8405     {
8406       ncopies = 1;
8407       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8408     }
8409   else
8410     {
8411       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8412       vec_num = 1;
8413     }
8414
8415   code_helper code = canonicalize_code (op.code, op.type);
8416   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8417
8418   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8419   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8420   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8421
8422   /* Transform.  */
8423   tree new_temp = NULL_TREE;
8424   auto_vec<tree> vec_oprnds0;
8425   auto_vec<tree> vec_oprnds1;
8426   auto_vec<tree> vec_oprnds2;
8427   tree def0;
8428
8429   if (dump_enabled_p ())
8430     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8431
8432   /* FORNOW: Multiple types are not supported for condition.  */
8433   if (code == COND_EXPR)
8434     gcc_assert (ncopies == 1);
8435
8436   /* A binary COND_OP reduction must have the same definition and else
8437      value. */
8438   bool cond_fn_p = code.is_internal_fn ()
8439     && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8440   if (cond_fn_p)
8441     {
8442       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8443                   || code == IFN_COND_MUL || code == IFN_COND_AND
8444                   || code == IFN_COND_IOR || code == IFN_COND_XOR);
8445       gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
8446     }
8447
8448   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8449
8450   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8451   if (reduction_type == FOLD_LEFT_REDUCTION)
8452     {
8453       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8454       gcc_assert (code.is_tree_code () || cond_fn_p);
8455       return vectorize_fold_left_reduction
8456           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8457            code, reduc_fn, op.ops, op.num_ops, vectype_in,
8458            reduc_index, masks, lens);
8459     }
8460
8461   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8462   gcc_assert (single_defuse_cycle
8463               || code == DOT_PROD_EXPR
8464               || code == WIDEN_SUM_EXPR
8465               || code == SAD_EXPR);
8466
8467   /* Create the destination vector  */
8468   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8469   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8470
8471   /* Get NCOPIES vector definitions for all operands except the reduction
8472      definition.  */
8473   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8474                      single_defuse_cycle && reduc_index == 0
8475                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
8476                      single_defuse_cycle && reduc_index == 1
8477                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
8478                      op.num_ops == 4
8479                      || (op.num_ops == 3
8480                          && !(single_defuse_cycle && reduc_index == 2))
8481                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8482
8483   /* For single def-use cycles get one copy of the vectorized reduction
8484      definition.  */
8485   if (single_defuse_cycle)
8486     {
8487       gcc_assert (!slp_node);
8488       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8489                                      op.ops[reduc_index],
8490                                      reduc_index == 0 ? &vec_oprnds0
8491                                      : (reduc_index == 1 ? &vec_oprnds1
8492                                         : &vec_oprnds2));
8493     }
8494
8495   bool emulated_mixed_dot_prod
8496     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8497   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8498     {
8499       gimple *new_stmt;
8500       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8501       if (masked_loop_p && !mask_by_cond_expr)
8502         {
8503           /* No conditional ifns have been defined for dot-product yet.  */
8504           gcc_assert (code != DOT_PROD_EXPR);
8505
8506           /* Make sure that the reduction accumulator is vop[0].  */
8507           if (reduc_index == 1)
8508             {
8509               gcc_assert (commutative_binary_op_p (code, op.type));
8510               std::swap (vop[0], vop[1]);
8511             }
8512           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8513                                           vec_num * ncopies, vectype_in, i);
8514           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8515                                                     vop[0], vop[1], vop[0]);
8516           new_temp = make_ssa_name (vec_dest, call);
8517           gimple_call_set_lhs (call, new_temp);
8518           gimple_call_set_nothrow (call, true);
8519           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8520           new_stmt = call;
8521         }
8522       else
8523         {
8524           if (op.num_ops >= 3)
8525             vop[2] = vec_oprnds2[i];
8526
8527           if (masked_loop_p && mask_by_cond_expr)
8528             {
8529               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8530                                               vec_num * ncopies, vectype_in, i);
8531               build_vect_cond_expr (code, vop, mask, gsi);
8532             }
8533
8534           if (emulated_mixed_dot_prod)
8535             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8536                                                     vec_dest, vop);
8537
8538           else if (code.is_internal_fn () && !cond_fn_p)
8539             new_stmt = gimple_build_call_internal (internal_fn (code),
8540                                                    op.num_ops,
8541                                                    vop[0], vop[1], vop[2]);
8542           else if (code.is_internal_fn () && cond_fn_p)
8543             new_stmt = gimple_build_call_internal (internal_fn (code),
8544                                                    op.num_ops,
8545                                                    vop[0], vop[1], vop[2],
8546                                                    vop[1]);
8547           else
8548             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8549                                             vop[0], vop[1], vop[2]);
8550           new_temp = make_ssa_name (vec_dest, new_stmt);
8551           gimple_set_lhs (new_stmt, new_temp);
8552           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8553         }
8554
8555       if (slp_node)
8556         slp_node->push_vec_def (new_stmt);
8557       else if (single_defuse_cycle
8558                && i < ncopies - 1)
8559         {
8560           if (reduc_index == 0)
8561             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8562           else if (reduc_index == 1)
8563             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8564           else if (reduc_index == 2)
8565             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8566         }
8567       else
8568         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8569     }
8570
8571   if (!slp_node)
8572     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8573
8574   return true;
8575 }
8576
8577 /* Transform phase of a cycle PHI.  */
8578
8579 bool
8580 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8581                           stmt_vec_info stmt_info, gimple **vec_stmt,
8582                           slp_tree slp_node, slp_instance slp_node_instance)
8583 {
8584   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8585   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8586   int i;
8587   int ncopies;
8588   int j;
8589   bool nested_cycle = false;
8590   int vec_num;
8591
8592   if (nested_in_vect_loop_p (loop, stmt_info))
8593     {
8594       loop = loop->inner;
8595       nested_cycle = true;
8596     }
8597
8598   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8599   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8600   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8601   gcc_assert (reduc_info->is_reduc_info);
8602
8603   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8604       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8605     /* Leave the scalar phi in place.  */
8606     return true;
8607
8608   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8609   /* For a nested cycle we do not fill the above.  */
8610   if (!vectype_in)
8611     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8612   gcc_assert (vectype_in);
8613
8614   if (slp_node)
8615     {
8616       /* The size vect_schedule_slp_instance computes is off for us.  */
8617       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8618                                       * SLP_TREE_LANES (slp_node), vectype_in);
8619       ncopies = 1;
8620     }
8621   else
8622     {
8623       vec_num = 1;
8624       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8625     }
8626
8627   /* Check whether we should use a single PHI node and accumulate
8628      vectors to one before the backedge.  */
8629   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8630     ncopies = 1;
8631
8632   /* Create the destination vector  */
8633   gphi *phi = as_a <gphi *> (stmt_info->stmt);
8634   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8635                                                vectype_out);
8636
8637   /* Get the loop-entry arguments.  */
8638   tree vec_initial_def = NULL_TREE;
8639   auto_vec<tree> vec_initial_defs;
8640   if (slp_node)
8641     {
8642       vec_initial_defs.reserve (vec_num);
8643       if (nested_cycle)
8644         {
8645           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8646           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8647                              &vec_initial_defs);
8648         }
8649       else
8650         {
8651           gcc_assert (slp_node == slp_node_instance->reduc_phis);
8652           vec<tree> &initial_values = reduc_info->reduc_initial_values;
8653           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8654
8655           unsigned int num_phis = stmts.length ();
8656           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8657             num_phis = 1;
8658           initial_values.reserve (num_phis);
8659           for (unsigned int i = 0; i < num_phis; ++i)
8660             {
8661               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8662               initial_values.quick_push (vect_phi_initial_value (this_phi));
8663             }
8664           if (vec_num == 1)
8665             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8666           if (!initial_values.is_empty ())
8667             {
8668               tree initial_value
8669                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8670               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8671               tree neutral_op
8672                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8673                                             code, initial_value);
8674               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8675                                               &vec_initial_defs, vec_num,
8676                                               stmts.length (), neutral_op);
8677             }
8678         }
8679     }
8680   else
8681     {
8682       /* Get at the scalar def before the loop, that defines the initial
8683          value of the reduction variable.  */
8684       tree initial_def = vect_phi_initial_value (phi);
8685       reduc_info->reduc_initial_values.safe_push (initial_def);
8686       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8687          and we can't use zero for induc_val, use initial_def.  Similarly
8688          for REDUC_MIN and initial_def larger than the base.  */
8689       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8690         {
8691           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8692           if (TREE_CODE (initial_def) == INTEGER_CST
8693               && !integer_zerop (induc_val)
8694               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8695                    && tree_int_cst_lt (initial_def, induc_val))
8696                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8697                       && tree_int_cst_lt (induc_val, initial_def))))
8698             {
8699               induc_val = initial_def;
8700               /* Communicate we used the initial_def to epilouge
8701                  generation.  */
8702               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8703             }
8704           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8705         }
8706       else if (nested_cycle)
8707         {
8708           /* Do not use an adjustment def as that case is not supported
8709              correctly if ncopies is not one.  */
8710           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8711                                          ncopies, initial_def,
8712                                          &vec_initial_defs);
8713         }
8714       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8715                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8716         /* Fill the initial vector with the initial scalar value.  */
8717         vec_initial_def
8718           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8719                                            initial_def, initial_def);
8720       else
8721         {
8722           if (ncopies == 1)
8723             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8724           if (!reduc_info->reduc_initial_values.is_empty ())
8725             {
8726               initial_def = reduc_info->reduc_initial_values[0];
8727               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8728               tree neutral_op
8729                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8730                                             code, initial_def);
8731               gcc_assert (neutral_op);
8732               /* Try to simplify the vector initialization by applying an
8733                  adjustment after the reduction has been performed.  */
8734               if (!reduc_info->reused_accumulator
8735                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8736                   && !operand_equal_p (neutral_op, initial_def))
8737                 {
8738                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8739                     = initial_def;
8740                   initial_def = neutral_op;
8741                 }
8742               vec_initial_def
8743                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8744                                                  initial_def, neutral_op);
8745             }
8746         }
8747     }
8748
8749   if (vec_initial_def)
8750     {
8751       vec_initial_defs.create (ncopies);
8752       for (i = 0; i < ncopies; ++i)
8753         vec_initial_defs.quick_push (vec_initial_def);
8754     }
8755
8756   if (auto *accumulator = reduc_info->reused_accumulator)
8757     {
8758       tree def = accumulator->reduc_input;
8759       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8760         {
8761           unsigned int nreduc;
8762           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8763                                             (TREE_TYPE (def)),
8764                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8765                                           &nreduc);
8766           gcc_assert (res);
8767           gimple_seq stmts = NULL;
8768           /* Reduce the single vector to a smaller one.  */
8769           if (nreduc != 1)
8770             {
8771               /* Perform the reduction in the appropriate type.  */
8772               tree rvectype = vectype_out;
8773               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8774                                               TREE_TYPE (TREE_TYPE (def))))
8775                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8776                                               TYPE_VECTOR_SUBPARTS
8777                                                 (vectype_out));
8778               def = vect_create_partial_epilog (def, rvectype,
8779                                                 STMT_VINFO_REDUC_CODE
8780                                                   (reduc_info),
8781                                                 &stmts);
8782             }
8783           /* The epilogue loop might use a different vector mode, like
8784              VNx2DI vs. V2DI.  */
8785           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8786             {
8787               tree reduc_type = build_vector_type_for_mode
8788                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8789               def = gimple_convert (&stmts, reduc_type, def);
8790             }
8791           /* Adjust the input so we pick up the partially reduced value
8792              for the skip edge in vect_create_epilog_for_reduction.  */
8793           accumulator->reduc_input = def;
8794           /* And the reduction could be carried out using a different sign.  */
8795           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8796             def = gimple_convert (&stmts, vectype_out, def);
8797           if (loop_vinfo->main_loop_edge)
8798             {
8799               /* While we'd like to insert on the edge this will split
8800                  blocks and disturb bookkeeping, we also will eventually
8801                  need this on the skip edge.  Rely on sinking to
8802                  fixup optimal placement and insert in the pred.  */
8803               gimple_stmt_iterator gsi
8804                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8805               /* Insert before a cond that eventually skips the
8806                  epilogue.  */
8807               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8808                 gsi_prev (&gsi);
8809               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8810             }
8811           else
8812             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8813                                               stmts);
8814         }
8815       if (loop_vinfo->main_loop_edge)
8816         vec_initial_defs[0]
8817           = vect_get_main_loop_result (loop_vinfo, def,
8818                                        vec_initial_defs[0]);
8819       else
8820         vec_initial_defs.safe_push (def);
8821     }
8822
8823   /* Generate the reduction PHIs upfront.  */
8824   for (i = 0; i < vec_num; i++)
8825     {
8826       tree vec_init_def = vec_initial_defs[i];
8827       for (j = 0; j < ncopies; j++)
8828         {
8829           /* Create the reduction-phi that defines the reduction
8830              operand.  */
8831           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8832
8833           /* Set the loop-entry arg of the reduction-phi.  */
8834           if (j != 0 && nested_cycle)
8835             vec_init_def = vec_initial_defs[j];
8836           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8837                        UNKNOWN_LOCATION);
8838
8839           /* The loop-latch arg is set in epilogue processing.  */
8840
8841           if (slp_node)
8842             slp_node->push_vec_def (new_phi);
8843           else
8844             {
8845               if (j == 0)
8846                 *vec_stmt = new_phi;
8847               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8848             }
8849         }
8850     }
8851
8852   return true;
8853 }
8854
8855 /* Vectorizes LC PHIs.  */
8856
8857 bool
8858 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8859                      stmt_vec_info stmt_info, gimple **vec_stmt,
8860                      slp_tree slp_node)
8861 {
8862   if (!loop_vinfo
8863       || !is_a <gphi *> (stmt_info->stmt)
8864       || gimple_phi_num_args (stmt_info->stmt) != 1)
8865     return false;
8866
8867   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8868       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8869     return false;
8870
8871   if (!vec_stmt) /* transformation not required.  */
8872     {
8873       /* Deal with copies from externs or constants that disguise as
8874          loop-closed PHI nodes (PR97886).  */
8875       if (slp_node
8876           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8877                                                 SLP_TREE_VECTYPE (slp_node)))
8878         {
8879           if (dump_enabled_p ())
8880             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8881                              "incompatible vector types for invariants\n");
8882           return false;
8883         }
8884       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8885       return true;
8886     }
8887
8888   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8889   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8890   basic_block bb = gimple_bb (stmt_info->stmt);
8891   edge e = single_pred_edge (bb);
8892   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8893   auto_vec<tree> vec_oprnds;
8894   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8895                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8896                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8897   for (unsigned i = 0; i < vec_oprnds.length (); i++)
8898     {
8899       /* Create the vectorized LC PHI node.  */
8900       gphi *new_phi = create_phi_node (vec_dest, bb);
8901       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8902       if (slp_node)
8903         slp_node->push_vec_def (new_phi);
8904       else
8905         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8906     }
8907   if (!slp_node)
8908     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8909
8910   return true;
8911 }
8912
8913 /* Vectorizes PHIs.  */
8914
8915 bool
8916 vectorizable_phi (vec_info *,
8917                   stmt_vec_info stmt_info, gimple **vec_stmt,
8918                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8919 {
8920   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8921     return false;
8922
8923   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8924     return false;
8925
8926   tree vectype = SLP_TREE_VECTYPE (slp_node);
8927
8928   if (!vec_stmt) /* transformation not required.  */
8929     {
8930       slp_tree child;
8931       unsigned i;
8932       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8933         if (!child)
8934           {
8935             if (dump_enabled_p ())
8936               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8937                                "PHI node with unvectorized backedge def\n");
8938             return false;
8939           }
8940         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8941           {
8942             if (dump_enabled_p ())
8943               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8944                                "incompatible vector types for invariants\n");
8945             return false;
8946           }
8947         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8948                  && !useless_type_conversion_p (vectype,
8949                                                 SLP_TREE_VECTYPE (child)))
8950           {
8951             /* With bools we can have mask and non-mask precision vectors
8952                or different non-mask precisions.  while pattern recog is
8953                supposed to guarantee consistency here bugs in it can cause
8954                mismatches (PR103489 and PR103800 for example).
8955                Deal with them here instead of ICEing later.  */
8956             if (dump_enabled_p ())
8957               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8958                                "incompatible vector type setup from "
8959                                "bool pattern detection\n");
8960             return false;
8961           }
8962
8963       /* For single-argument PHIs assume coalescing which means zero cost
8964          for the scalar and the vector PHIs.  This avoids artificially
8965          favoring the vector path (but may pessimize it in some cases).  */
8966       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8967         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8968                           vector_stmt, stmt_info, vectype, 0, vect_body);
8969       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8970       return true;
8971     }
8972
8973   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8974   basic_block bb = gimple_bb (stmt_info->stmt);
8975   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8976   auto_vec<gphi *> new_phis;
8977   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8978     {
8979       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8980
8981       /* Skip not yet vectorized defs.  */
8982       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8983           && SLP_TREE_VEC_DEFS (child).is_empty ())
8984         continue;
8985
8986       auto_vec<tree> vec_oprnds;
8987       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8988       if (!new_phis.exists ())
8989         {
8990           new_phis.create (vec_oprnds.length ());
8991           for (unsigned j = 0; j < vec_oprnds.length (); j++)
8992             {
8993               /* Create the vectorized LC PHI node.  */
8994               new_phis.quick_push (create_phi_node (vec_dest, bb));
8995               slp_node->push_vec_def (new_phis[j]);
8996             }
8997         }
8998       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8999       for (unsigned j = 0; j < vec_oprnds.length (); j++)
9000         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9001     }
9002   /* We should have at least one already vectorized child.  */
9003   gcc_assert (new_phis.exists ());
9004
9005   return true;
9006 }
9007
9008 /* Vectorizes first order recurrences.  An overview of the transformation
9009    is described below. Suppose we have the following loop.
9010
9011      int t = 0;
9012      for (int i = 0; i < n; ++i)
9013        {
9014          b[i] = a[i] - t;
9015          t = a[i];
9016        }
9017
9018    There is a first-order recurrence on 'a'. For this loop, the scalar IR
9019    looks (simplified) like:
9020
9021     scalar.preheader:
9022       init = 0;
9023
9024     scalar.body:
9025       i = PHI <0(scalar.preheader), i+1(scalar.body)>
9026       _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9027       _1 = a[i]
9028       b[i] = _1 - _2
9029       if (i < n) goto scalar.body
9030
9031    In this example, _2 is a recurrence because it's value depends on the
9032    previous iteration.  We vectorize this as (VF = 4)
9033
9034     vector.preheader:
9035       vect_init = vect_cst(..., ..., ..., 0)
9036
9037     vector.body
9038       i = PHI <0(vector.preheader), i+4(vector.body)>
9039       vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9040       vect_2 = a[i, i+1, i+2, i+3];
9041       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9042       b[i, i+1, i+2, i+3] = vect_2 - vect_3
9043       if (..) goto vector.body
9044
9045    In this function, vectorizable_recurr, we code generate both the
9046    vector PHI node and the permute since those together compute the
9047    vectorized value of the scalar PHI.  We do not yet have the
9048    backedge value to fill in there nor into the vec_perm.  Those
9049    are filled in maybe_set_vectorized_backedge_value and
9050    vect_schedule_scc.
9051
9052    TODO:  Since the scalar loop does not have a use of the recurrence
9053    outside of the loop the natural way to implement peeling via
9054    vectorizing the live value doesn't work.  For now peeling of loops
9055    with a recurrence is not implemented.  For SLP the supported cases
9056    are restricted to those requiring a single vector recurrence PHI.  */
9057
9058 bool
9059 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9060                      gimple **vec_stmt, slp_tree slp_node,
9061                      stmt_vector_for_cost *cost_vec)
9062 {
9063   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9064     return false;
9065
9066   gphi *phi = as_a<gphi *> (stmt_info->stmt);
9067
9068   /* So far we only support first-order recurrence auto-vectorization.  */
9069   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9070     return false;
9071
9072   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9073   unsigned ncopies;
9074   if (slp_node)
9075     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9076   else
9077     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9078   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9079   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9080   /* We need to be able to make progress with a single vector.  */
9081   if (maybe_gt (dist * 2, nunits))
9082     {
9083       if (dump_enabled_p ())
9084         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9085                          "first order recurrence exceeds half of "
9086                          "a vector\n");
9087       return false;
9088     }
9089
9090   /* First-order recurrence autovectorization needs to handle permutation
9091      with indices = [nunits-1, nunits, nunits+1, ...].  */
9092   vec_perm_builder sel (nunits, 1, 3);
9093   for (int i = 0; i < 3; ++i)
9094     sel.quick_push (nunits - dist + i);
9095   vec_perm_indices indices (sel, 2, nunits);
9096
9097   if (!vec_stmt) /* transformation not required.  */
9098     {
9099       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9100                                  indices))
9101         return false;
9102
9103       if (slp_node)
9104         {
9105           /* We eventually need to set a vector type on invariant
9106              arguments.  */
9107           unsigned j;
9108           slp_tree child;
9109           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9110             if (!vect_maybe_update_slp_op_vectype
9111                   (child, SLP_TREE_VECTYPE (slp_node)))
9112               {
9113                 if (dump_enabled_p ())
9114                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9115                                    "incompatible vector types for "
9116                                    "invariants\n");
9117                 return false;
9118               }
9119         }
9120       /* The recurrence costs the initialization vector and one permute
9121          for each copy.  */
9122       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9123                                                  stmt_info, 0, vect_prologue);
9124       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9125                                                stmt_info, 0, vect_body);
9126       if (dump_enabled_p ())
9127         dump_printf_loc (MSG_NOTE, vect_location,
9128                          "vectorizable_recurr: inside_cost = %d, "
9129                          "prologue_cost = %d .\n", inside_cost,
9130                          prologue_cost);
9131
9132       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9133       return true;
9134     }
9135
9136   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9137   basic_block bb = gimple_bb (phi);
9138   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9139   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9140     {
9141       gimple_seq stmts = NULL;
9142       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9143       gsi_insert_seq_on_edge_immediate (pe, stmts);
9144     }
9145   tree vec_init = build_vector_from_val (vectype, preheader);
9146   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9147
9148   /* Create the vectorized first-order PHI node.  */
9149   tree vec_dest = vect_get_new_vect_var (vectype,
9150                                          vect_simple_var, "vec_recur_");
9151   gphi *new_phi = create_phi_node (vec_dest, bb);
9152   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9153
9154   /* Insert shuffles the first-order recurrence autovectorization.
9155        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
9156   tree perm = vect_gen_perm_mask_checked (vectype, indices);
9157
9158   /* Insert the required permute after the latch definition.  The
9159      second and later operands are tentative and will be updated when we have
9160      vectorized the latch definition.  */
9161   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9162   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9163   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9164   gsi_next (&gsi2);
9165
9166   for (unsigned i = 0; i < ncopies; ++i)
9167     {
9168       vec_dest = make_ssa_name (vectype);
9169       gassign *vperm
9170           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9171                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
9172                                  NULL, perm);
9173       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9174
9175       if (slp_node)
9176         slp_node->push_vec_def (vperm);
9177       else
9178         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9179     }
9180
9181   if (!slp_node)
9182     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9183   return true;
9184 }
9185
9186 /* Return true if VECTYPE represents a vector that requires lowering
9187    by the vector lowering pass.  */
9188
9189 bool
9190 vect_emulated_vector_p (tree vectype)
9191 {
9192   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9193           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9194               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9195 }
9196
9197 /* Return true if we can emulate CODE on an integer mode representation
9198    of a vector.  */
9199
9200 bool
9201 vect_can_vectorize_without_simd_p (tree_code code)
9202 {
9203   switch (code)
9204     {
9205     case PLUS_EXPR:
9206     case MINUS_EXPR:
9207     case NEGATE_EXPR:
9208     case BIT_AND_EXPR:
9209     case BIT_IOR_EXPR:
9210     case BIT_XOR_EXPR:
9211     case BIT_NOT_EXPR:
9212       return true;
9213
9214     default:
9215       return false;
9216     }
9217 }
9218
9219 /* Likewise, but taking a code_helper.  */
9220
9221 bool
9222 vect_can_vectorize_without_simd_p (code_helper code)
9223 {
9224   return (code.is_tree_code ()
9225           && vect_can_vectorize_without_simd_p (tree_code (code)));
9226 }
9227
9228 /* Create vector init for vectorized iv.  */
9229 static tree
9230 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9231                                tree step_expr, poly_uint64 nunits,
9232                                tree vectype,
9233                                enum vect_induction_op_type induction_type)
9234 {
9235   unsigned HOST_WIDE_INT const_nunits;
9236   tree vec_shift, vec_init, new_name;
9237   unsigned i;
9238   tree itype = TREE_TYPE (vectype);
9239
9240   /* iv_loop is the loop to be vectorized. Create:
9241      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
9242   new_name = gimple_convert (stmts, itype, init_expr);
9243   switch (induction_type)
9244     {
9245     case vect_step_op_shr:
9246     case vect_step_op_shl:
9247       /* Build the Initial value from shift_expr.  */
9248       vec_init = gimple_build_vector_from_val (stmts,
9249                                                vectype,
9250                                                new_name);
9251       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9252                                 build_zero_cst (itype), step_expr);
9253       vec_init = gimple_build (stmts,
9254                                (induction_type == vect_step_op_shr
9255                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9256                                vectype, vec_init, vec_shift);
9257       break;
9258
9259     case vect_step_op_neg:
9260       {
9261         vec_init = gimple_build_vector_from_val (stmts,
9262                                                  vectype,
9263                                                  new_name);
9264         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9265                                      vectype, vec_init);
9266         /* The encoding has 2 interleaved stepped patterns.  */
9267         vec_perm_builder sel (nunits, 2, 3);
9268         sel.quick_grow (6);
9269         for (i = 0; i < 3; i++)
9270           {
9271             sel[2 * i] = i;
9272             sel[2 * i + 1] = i + nunits;
9273           }
9274         vec_perm_indices indices (sel, 2, nunits);
9275         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9276            fail when vec_init is const vector. In that situation vec_perm is not
9277            really needed.  */
9278         tree perm_mask_even
9279           = vect_gen_perm_mask_any (vectype, indices);
9280         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9281                                  vectype,
9282                                  vec_init, vec_neg,
9283                                  perm_mask_even);
9284       }
9285       break;
9286
9287     case vect_step_op_mul:
9288       {
9289         /* Use unsigned mult to avoid UD integer overflow.  */
9290         gcc_assert (nunits.is_constant (&const_nunits));
9291         tree utype = unsigned_type_for (itype);
9292         tree uvectype = build_vector_type (utype,
9293                                            TYPE_VECTOR_SUBPARTS (vectype));
9294         new_name = gimple_convert (stmts, utype, new_name);
9295         vec_init = gimple_build_vector_from_val (stmts,
9296                                                  uvectype,
9297                                                  new_name);
9298         tree_vector_builder elts (uvectype, const_nunits, 1);
9299         tree elt_step = build_one_cst (utype);
9300
9301         elts.quick_push (elt_step);
9302         for (i = 1; i < const_nunits; i++)
9303           {
9304             /* Create: new_name_i = new_name + step_expr.  */
9305             elt_step = gimple_build (stmts, MULT_EXPR,
9306                                      utype, elt_step, step_expr);
9307             elts.quick_push (elt_step);
9308           }
9309         /* Create a vector from [new_name_0, new_name_1, ...,
9310            new_name_nunits-1].  */
9311         tree vec_mul = gimple_build_vector (stmts, &elts);
9312         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9313                                  vec_init, vec_mul);
9314         vec_init = gimple_convert (stmts, vectype, vec_init);
9315       }
9316       break;
9317
9318     default:
9319       gcc_unreachable ();
9320     }
9321
9322   return vec_init;
9323 }
9324
9325 /* Peel init_expr by skip_niter for induction_type.  */
9326 tree
9327 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9328                              tree skip_niters, tree step_expr,
9329                              enum vect_induction_op_type induction_type)
9330 {
9331   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9332   tree type = TREE_TYPE (init_expr);
9333   unsigned prec = TYPE_PRECISION (type);
9334   switch (induction_type)
9335     {
9336     case vect_step_op_neg:
9337       if (TREE_INT_CST_LOW (skip_niters) % 2)
9338         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9339       /* else no change.  */
9340       break;
9341
9342     case vect_step_op_shr:
9343     case vect_step_op_shl:
9344       skip_niters = gimple_convert (stmts, type, skip_niters);
9345       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9346       /* When shift mount >= precision, need to avoid UD.
9347          In the original loop, there's no UD, and according to semantic,
9348          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9349       if (!tree_fits_uhwi_p (step_expr)
9350           || tree_to_uhwi (step_expr) >= prec)
9351         {
9352           if (induction_type == vect_step_op_shl
9353               || TYPE_UNSIGNED (type))
9354             init_expr = build_zero_cst (type);
9355           else
9356             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9357                                       init_expr,
9358                                       wide_int_to_tree (type, prec - 1));
9359         }
9360       else
9361         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9362                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9363                                   type, init_expr, step_expr);
9364       break;
9365
9366     case vect_step_op_mul:
9367       {
9368         tree utype = unsigned_type_for (type);
9369         init_expr = gimple_convert (stmts, utype, init_expr);
9370         wide_int skipn = wi::to_wide (skip_niters);
9371         wide_int begin = wi::to_wide (step_expr);
9372         auto_mpz base, exp, mod, res;
9373         wi::to_mpz (begin, base, TYPE_SIGN (type));
9374         wi::to_mpz (skipn, exp, UNSIGNED);
9375         mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9376         mpz_powm (res, base, exp, mod);
9377         begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9378         tree mult_expr = wide_int_to_tree (utype, begin);
9379         init_expr = gimple_build (stmts, MULT_EXPR, utype,
9380                                   init_expr, mult_expr);
9381         init_expr = gimple_convert (stmts, type, init_expr);
9382       }
9383       break;
9384
9385     default:
9386       gcc_unreachable ();
9387     }
9388
9389   return init_expr;
9390 }
9391
9392 /* Create vector step for vectorized iv.  */
9393 static tree
9394 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9395                                poly_uint64 vf,
9396                                enum vect_induction_op_type induction_type)
9397 {
9398   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9399   tree new_name = NULL;
9400   /* Step should be pow (step, vf) for mult induction.  */
9401   if (induction_type == vect_step_op_mul)
9402     {
9403       gcc_assert (vf.is_constant ());
9404       wide_int begin = wi::to_wide (step_expr);
9405
9406       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9407         begin = wi::mul (begin, wi::to_wide (step_expr));
9408
9409       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9410     }
9411   else if (induction_type == vect_step_op_neg)
9412     /* Do nothing.  */
9413     ;
9414   else
9415     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9416                              expr, step_expr);
9417   return new_name;
9418 }
9419
9420 static tree
9421 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9422                                    stmt_vec_info stmt_info,
9423                                    tree new_name, tree vectype,
9424                                    enum vect_induction_op_type induction_type)
9425 {
9426   /* No step is needed for neg induction.  */
9427   if (induction_type == vect_step_op_neg)
9428     return NULL;
9429
9430   tree t = unshare_expr (new_name);
9431   gcc_assert (CONSTANT_CLASS_P (new_name)
9432               || TREE_CODE (new_name) == SSA_NAME);
9433   tree new_vec = build_vector_from_val (vectype, t);
9434   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9435                                     new_vec, vectype, NULL);
9436   return vec_step;
9437 }
9438
9439 /* Update vectorized iv with vect_step, induc_def is init.  */
9440 static tree
9441 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9442                           tree induc_def, tree vec_step,
9443                           enum vect_induction_op_type induction_type)
9444 {
9445   tree vec_def = induc_def;
9446   switch (induction_type)
9447     {
9448     case vect_step_op_mul:
9449       {
9450         /* Use unsigned mult to avoid UD integer overflow.  */
9451         tree uvectype
9452           = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9453                                TYPE_VECTOR_SUBPARTS (vectype));
9454         vec_def = gimple_convert (stmts, uvectype, vec_def);
9455         vec_step = gimple_convert (stmts, uvectype, vec_step);
9456         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9457                                 vec_def, vec_step);
9458         vec_def = gimple_convert (stmts, vectype, vec_def);
9459       }
9460       break;
9461
9462     case vect_step_op_shr:
9463       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9464                               vec_def, vec_step);
9465       break;
9466
9467     case vect_step_op_shl:
9468       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9469                               vec_def, vec_step);
9470       break;
9471     case vect_step_op_neg:
9472       vec_def = induc_def;
9473       /* Do nothing.  */
9474       break;
9475     default:
9476       gcc_unreachable ();
9477     }
9478
9479   return vec_def;
9480
9481 }
9482
9483 /* Function vectorizable_induction
9484
9485    Check if STMT_INFO performs an nonlinear induction computation that can be
9486    vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9487    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9488    basic block.
9489    Return true if STMT_INFO is vectorizable in this way.  */
9490
9491 static bool
9492 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9493                                   stmt_vec_info stmt_info,
9494                                   gimple **vec_stmt, slp_tree slp_node,
9495                                   stmt_vector_for_cost *cost_vec)
9496 {
9497   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9498   unsigned ncopies;
9499   bool nested_in_vect_loop = false;
9500   class loop *iv_loop;
9501   tree vec_def;
9502   edge pe = loop_preheader_edge (loop);
9503   basic_block new_bb;
9504   tree vec_init, vec_step;
9505   tree new_name;
9506   gimple *new_stmt;
9507   gphi *induction_phi;
9508   tree induc_def, vec_dest;
9509   tree init_expr, step_expr;
9510   tree niters_skip;
9511   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9512   unsigned i;
9513   gimple_stmt_iterator si;
9514
9515   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9516
9517   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9518   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9519   enum vect_induction_op_type induction_type
9520     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9521
9522   gcc_assert (induction_type > vect_step_op_add);
9523
9524   if (slp_node)
9525     ncopies = 1;
9526   else
9527     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9528   gcc_assert (ncopies >= 1);
9529
9530   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9531   if (nested_in_vect_loop_p (loop, stmt_info))
9532     {
9533       if (dump_enabled_p ())
9534         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9535                          "nonlinear induction in nested loop.\n");
9536       return false;
9537     }
9538
9539   iv_loop = loop;
9540   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9541
9542   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9543      update for each iv and a permutation to generate wanted vector iv.  */
9544   if (slp_node)
9545     {
9546       if (dump_enabled_p ())
9547         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9548                          "SLP induction not supported for nonlinear"
9549                          " induction.\n");
9550       return false;
9551     }
9552
9553   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9554     {
9555       if (dump_enabled_p ())
9556         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9557                          "floating point nonlinear induction vectorization"
9558                          " not supported.\n");
9559       return false;
9560     }
9561
9562   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9563   init_expr = vect_phi_initial_value (phi);
9564   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9565               && TREE_CODE (step_expr) == INTEGER_CST);
9566   /* step_expr should be aligned with init_expr,
9567      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
9568   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9569
9570   if (TREE_CODE (init_expr) == INTEGER_CST)
9571     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9572   else
9573     gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9574                                        TREE_TYPE (init_expr)));
9575
9576   switch (induction_type)
9577     {
9578     case vect_step_op_neg:
9579       if (TREE_CODE (init_expr) != INTEGER_CST
9580           && TREE_CODE (init_expr) != REAL_CST)
9581         {
9582           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9583           if (!directly_supported_p (NEGATE_EXPR, vectype))
9584             return false;
9585
9586           /* The encoding has 2 interleaved stepped patterns.  */
9587           vec_perm_builder sel (nunits, 2, 3);
9588           machine_mode mode = TYPE_MODE (vectype);
9589           sel.quick_grow (6);
9590           for (i = 0; i < 3; i++)
9591             {
9592               sel[i * 2] = i;
9593               sel[i * 2 + 1] = i + nunits;
9594             }
9595           vec_perm_indices indices (sel, 2, nunits);
9596           if (!can_vec_perm_const_p (mode, mode, indices))
9597             return false;
9598         }
9599       break;
9600
9601     case vect_step_op_mul:
9602       {
9603         /* Check for backend support of MULT_EXPR.  */
9604         if (!directly_supported_p (MULT_EXPR, vectype))
9605           return false;
9606
9607         /* ?? How to construct vector step for variable number vector.
9608            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9609         if (!vf.is_constant ())
9610           return false;
9611       }
9612       break;
9613
9614     case vect_step_op_shr:
9615       /* Check for backend support of RSHIFT_EXPR.  */
9616       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9617         return false;
9618
9619       /* Don't shift more than type precision to avoid UD.  */
9620       if (!tree_fits_uhwi_p (step_expr)
9621           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9622                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9623         return false;
9624       break;
9625
9626     case vect_step_op_shl:
9627       /* Check for backend support of RSHIFT_EXPR.  */
9628       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9629         return false;
9630
9631       /* Don't shift more than type precision to avoid UD.  */
9632       if (!tree_fits_uhwi_p (step_expr)
9633           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9634                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9635         return false;
9636
9637       break;
9638
9639     default:
9640       gcc_unreachable ();
9641     }
9642
9643   if (!vec_stmt) /* transformation not required.  */
9644     {
9645       unsigned inside_cost = 0, prologue_cost = 0;
9646       /* loop cost for vec_loop. Neg induction doesn't have any
9647          inside_cost.  */
9648       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9649                                       stmt_info, 0, vect_body);
9650
9651       /* loop cost for vec_loop. Neg induction doesn't have any
9652          inside_cost.  */
9653       if (induction_type == vect_step_op_neg)
9654         inside_cost = 0;
9655
9656       /* prologue cost for vec_init and vec_step.  */
9657       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9658                                         stmt_info, 0, vect_prologue);
9659
9660       if (dump_enabled_p ())
9661         dump_printf_loc (MSG_NOTE, vect_location,
9662                          "vect_model_induction_cost: inside_cost = %d, "
9663                          "prologue_cost = %d. \n", inside_cost,
9664                          prologue_cost);
9665
9666       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9667       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9668       return true;
9669     }
9670
9671   /* Transform.  */
9672
9673   /* Compute a vector variable, initialized with the first VF values of
9674      the induction variable.  E.g., for an iv with IV_PHI='X' and
9675      evolution S, for a vector of 4 units, we want to compute:
9676      [X, X + S, X + 2*S, X + 3*S].  */
9677
9678   if (dump_enabled_p ())
9679     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9680
9681   pe = loop_preheader_edge (iv_loop);
9682   /* Find the first insertion point in the BB.  */
9683   basic_block bb = gimple_bb (phi);
9684   si = gsi_after_labels (bb);
9685
9686   gimple_seq stmts = NULL;
9687
9688   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9689   /* If we are using the loop mask to "peel" for alignment then we need
9690      to adjust the start value here.  */
9691   if (niters_skip != NULL_TREE)
9692     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9693                                              step_expr, induction_type);
9694
9695   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9696                                             step_expr, nunits, vectype,
9697                                             induction_type);
9698   if (stmts)
9699     {
9700       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9701       gcc_assert (!new_bb);
9702     }
9703
9704   stmts = NULL;
9705   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9706                                             vf, induction_type);
9707   if (stmts)
9708     {
9709       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9710       gcc_assert (!new_bb);
9711     }
9712
9713   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9714                                                 new_name, vectype,
9715                                                 induction_type);
9716   /* Create the following def-use cycle:
9717      loop prolog:
9718      vec_init = ...
9719      vec_step = ...
9720      loop:
9721      vec_iv = PHI <vec_init, vec_loop>
9722      ...
9723      STMT
9724      ...
9725      vec_loop = vec_iv + vec_step;  */
9726
9727   /* Create the induction-phi that defines the induction-operand.  */
9728   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9729   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9730   induc_def = PHI_RESULT (induction_phi);
9731
9732   /* Create the iv update inside the loop.  */
9733   stmts = NULL;
9734   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9735                                       induc_def, vec_step,
9736                                       induction_type);
9737
9738   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9739   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9740
9741   /* Set the arguments of the phi node:  */
9742   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9743   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9744                UNKNOWN_LOCATION);
9745
9746   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9747   *vec_stmt = induction_phi;
9748
9749   /* In case that vectorization factor (VF) is bigger than the number
9750      of elements that we can fit in a vectype (nunits), we have to generate
9751      more than one vector stmt - i.e - we need to "unroll" the
9752      vector stmt by a factor VF/nunits.  For more details see documentation
9753      in vectorizable_operation.  */
9754
9755   if (ncopies > 1)
9756     {
9757       stmts = NULL;
9758       /* FORNOW. This restriction should be relaxed.  */
9759       gcc_assert (!nested_in_vect_loop);
9760
9761       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9762                                                 nunits, induction_type);
9763
9764       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9765                                                     new_name, vectype,
9766                                                     induction_type);
9767       vec_def = induc_def;
9768       for (i = 1; i < ncopies; i++)
9769         {
9770           /* vec_i = vec_prev + vec_step.  */
9771           stmts = NULL;
9772           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9773                                               vec_def, vec_step,
9774                                               induction_type);
9775           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9776           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9777           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9778         }
9779     }
9780
9781   if (dump_enabled_p ())
9782     dump_printf_loc (MSG_NOTE, vect_location,
9783                      "transform induction: created def-use cycle: %G%G",
9784                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9785
9786   return true;
9787 }
9788
9789 /* Function vectorizable_induction
9790
9791    Check if STMT_INFO performs an induction computation that can be vectorized.
9792    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9793    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9794    Return true if STMT_INFO is vectorizable in this way.  */
9795
9796 bool
9797 vectorizable_induction (loop_vec_info loop_vinfo,
9798                         stmt_vec_info stmt_info,
9799                         gimple **vec_stmt, slp_tree slp_node,
9800                         stmt_vector_for_cost *cost_vec)
9801 {
9802   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9803   unsigned ncopies;
9804   bool nested_in_vect_loop = false;
9805   class loop *iv_loop;
9806   tree vec_def;
9807   edge pe = loop_preheader_edge (loop);
9808   basic_block new_bb;
9809   tree new_vec, vec_init, vec_step, t;
9810   tree new_name;
9811   gimple *new_stmt;
9812   gphi *induction_phi;
9813   tree induc_def, vec_dest;
9814   tree init_expr, step_expr;
9815   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9816   unsigned i;
9817   tree expr;
9818   gimple_stmt_iterator si;
9819   enum vect_induction_op_type induction_type
9820     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9821
9822   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9823   if (!phi)
9824     return false;
9825
9826   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9827     return false;
9828
9829   /* Make sure it was recognized as induction computation.  */
9830   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9831     return false;
9832
9833   /* Handle nonlinear induction in a separate place.  */
9834   if (induction_type != vect_step_op_add)
9835     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9836                                              vec_stmt, slp_node, cost_vec);
9837
9838   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9839   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9840
9841   if (slp_node)
9842     ncopies = 1;
9843   else
9844     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9845   gcc_assert (ncopies >= 1);
9846
9847   /* FORNOW. These restrictions should be relaxed.  */
9848   if (nested_in_vect_loop_p (loop, stmt_info))
9849     {
9850       imm_use_iterator imm_iter;
9851       use_operand_p use_p;
9852       gimple *exit_phi;
9853       edge latch_e;
9854       tree loop_arg;
9855
9856       if (ncopies > 1)
9857         {
9858           if (dump_enabled_p ())
9859             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9860                              "multiple types in nested loop.\n");
9861           return false;
9862         }
9863
9864       exit_phi = NULL;
9865       latch_e = loop_latch_edge (loop->inner);
9866       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9867       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9868         {
9869           gimple *use_stmt = USE_STMT (use_p);
9870           if (is_gimple_debug (use_stmt))
9871             continue;
9872
9873           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9874             {
9875               exit_phi = use_stmt;
9876               break;
9877             }
9878         }
9879       if (exit_phi)
9880         {
9881           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9882           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9883                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9884             {
9885               if (dump_enabled_p ())
9886                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9887                                  "inner-loop induction only used outside "
9888                                  "of the outer vectorized loop.\n");
9889               return false;
9890             }
9891         }
9892
9893       nested_in_vect_loop = true;
9894       iv_loop = loop->inner;
9895     }
9896   else
9897     iv_loop = loop;
9898   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9899
9900   if (slp_node && !nunits.is_constant ())
9901     {
9902       /* The current SLP code creates the step value element-by-element.  */
9903       if (dump_enabled_p ())
9904         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9905                          "SLP induction not supported for variable-length"
9906                          " vectors.\n");
9907       return false;
9908     }
9909
9910   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9911     {
9912       if (dump_enabled_p ())
9913         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9914                          "floating point induction vectorization disabled\n");
9915       return false;
9916     }
9917
9918   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9919   gcc_assert (step_expr != NULL_TREE);
9920   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9921
9922   /* Check for backend support of PLUS/MINUS_EXPR. */
9923   if (!directly_supported_p (PLUS_EXPR, step_vectype)
9924       || !directly_supported_p (MINUS_EXPR, step_vectype))
9925     return false;
9926
9927   if (!vec_stmt) /* transformation not required.  */
9928     {
9929       unsigned inside_cost = 0, prologue_cost = 0;
9930       if (slp_node)
9931         {
9932           /* We eventually need to set a vector type on invariant
9933              arguments.  */
9934           unsigned j;
9935           slp_tree child;
9936           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9937             if (!vect_maybe_update_slp_op_vectype
9938                 (child, SLP_TREE_VECTYPE (slp_node)))
9939               {
9940                 if (dump_enabled_p ())
9941                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9942                                    "incompatible vector types for "
9943                                    "invariants\n");
9944                 return false;
9945               }
9946           /* loop cost for vec_loop.  */
9947           inside_cost
9948             = record_stmt_cost (cost_vec,
9949                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9950                                 vector_stmt, stmt_info, 0, vect_body);
9951           /* prologue cost for vec_init (if not nested) and step.  */
9952           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9953                                             scalar_to_vec,
9954                                             stmt_info, 0, vect_prologue);
9955         }
9956       else /* if (!slp_node) */
9957         {
9958           /* loop cost for vec_loop.  */
9959           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9960                                           stmt_info, 0, vect_body);
9961           /* prologue cost for vec_init and vec_step.  */
9962           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9963                                             stmt_info, 0, vect_prologue);
9964         }
9965       if (dump_enabled_p ())
9966         dump_printf_loc (MSG_NOTE, vect_location,
9967                          "vect_model_induction_cost: inside_cost = %d, "
9968                          "prologue_cost = %d .\n", inside_cost,
9969                          prologue_cost);
9970
9971       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9972       DUMP_VECT_SCOPE ("vectorizable_induction");
9973       return true;
9974     }
9975
9976   /* Transform.  */
9977
9978   /* Compute a vector variable, initialized with the first VF values of
9979      the induction variable.  E.g., for an iv with IV_PHI='X' and
9980      evolution S, for a vector of 4 units, we want to compute:
9981      [X, X + S, X + 2*S, X + 3*S].  */
9982
9983   if (dump_enabled_p ())
9984     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9985
9986   pe = loop_preheader_edge (iv_loop);
9987   /* Find the first insertion point in the BB.  */
9988   basic_block bb = gimple_bb (phi);
9989   si = gsi_after_labels (bb);
9990
9991   /* For SLP induction we have to generate several IVs as for example
9992      with group size 3 we need
9993        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9994        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
9995   if (slp_node)
9996     {
9997       /* Enforced above.  */
9998       unsigned int const_nunits = nunits.to_constant ();
9999
10000       /* The initial values are vectorized, but any lanes > group_size
10001          need adjustment.  */
10002       slp_tree init_node
10003         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10004
10005       /* Gather steps.  Since we do not vectorize inductions as
10006          cycles we have to reconstruct the step from SCEV data.  */
10007       unsigned group_size = SLP_TREE_LANES (slp_node);
10008       tree *steps = XALLOCAVEC (tree, group_size);
10009       tree *inits = XALLOCAVEC (tree, group_size);
10010       stmt_vec_info phi_info;
10011       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10012         {
10013           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10014           if (!init_node)
10015             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10016                                            pe->dest_idx);
10017         }
10018
10019       /* Now generate the IVs.  */
10020       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10021       gcc_assert ((const_nunits * nvects) % group_size == 0);
10022       unsigned nivs;
10023       if (nested_in_vect_loop)
10024         nivs = nvects;
10025       else
10026         {
10027           /* Compute the number of distinct IVs we need.  First reduce
10028              group_size if it is a multiple of const_nunits so we get
10029              one IV for a group_size of 4 but const_nunits 2.  */
10030           unsigned group_sizep = group_size;
10031           if (group_sizep % const_nunits == 0)
10032             group_sizep = group_sizep / const_nunits;
10033           nivs = least_common_multiple (group_sizep,
10034                                         const_nunits) / const_nunits;
10035         }
10036       tree stept = TREE_TYPE (step_vectype);
10037       tree lupdate_mul = NULL_TREE;
10038       if (!nested_in_vect_loop)
10039         {
10040           /* The number of iterations covered in one vector iteration.  */
10041           unsigned lup_mul = (nvects * const_nunits) / group_size;
10042           lupdate_mul
10043             = build_vector_from_val (step_vectype,
10044                                      SCALAR_FLOAT_TYPE_P (stept)
10045                                      ? build_real_from_wide (stept, lup_mul,
10046                                                              UNSIGNED)
10047                                      : build_int_cstu (stept, lup_mul));
10048         }
10049       tree peel_mul = NULL_TREE;
10050       gimple_seq init_stmts = NULL;
10051       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10052         {
10053           if (SCALAR_FLOAT_TYPE_P (stept))
10054             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10055                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10056           else
10057             peel_mul = gimple_convert (&init_stmts, stept,
10058                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10059           peel_mul = gimple_build_vector_from_val (&init_stmts,
10060                                                    step_vectype, peel_mul);
10061         }
10062       unsigned ivn;
10063       auto_vec<tree> vec_steps;
10064       for (ivn = 0; ivn < nivs; ++ivn)
10065         {
10066           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10067           tree_vector_builder init_elts (vectype, const_nunits, 1);
10068           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10069           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10070             {
10071               /* The scalar steps of the IVs.  */
10072               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10073               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10074               step_elts.quick_push (elt);
10075               if (!init_node)
10076                 {
10077                   /* The scalar inits of the IVs if not vectorized.  */
10078                   elt = inits[(ivn*const_nunits + eltn) % group_size];
10079                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
10080                                                   TREE_TYPE (elt)))
10081                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10082                                         TREE_TYPE (vectype), elt);
10083                   init_elts.quick_push (elt);
10084                 }
10085               /* The number of steps to add to the initial values.  */
10086               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10087               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10088                                    ? build_real_from_wide (stept,
10089                                                            mul_elt, UNSIGNED)
10090                                    : build_int_cstu (stept, mul_elt));
10091             }
10092           vec_step = gimple_build_vector (&init_stmts, &step_elts);
10093           vec_steps.safe_push (vec_step);
10094           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10095           if (peel_mul)
10096             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10097                                      step_mul, peel_mul);
10098           if (!init_node)
10099             vec_init = gimple_build_vector (&init_stmts, &init_elts);
10100
10101           /* Create the induction-phi that defines the induction-operand.  */
10102           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10103                                             "vec_iv_");
10104           induction_phi = create_phi_node (vec_dest, iv_loop->header);
10105           induc_def = PHI_RESULT (induction_phi);
10106
10107           /* Create the iv update inside the loop  */
10108           tree up = vec_step;
10109           if (lupdate_mul)
10110             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10111                                vec_step, lupdate_mul);
10112           gimple_seq stmts = NULL;
10113           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10114           vec_def = gimple_build (&stmts,
10115                                   PLUS_EXPR, step_vectype, vec_def, up);
10116           vec_def = gimple_convert (&stmts, vectype, vec_def);
10117           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10118           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10119                        UNKNOWN_LOCATION);
10120
10121           if (init_node)
10122             vec_init = vect_get_slp_vect_def (init_node, ivn);
10123           if (!nested_in_vect_loop
10124               && !integer_zerop (step_mul))
10125             {
10126               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10127               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10128                                  vec_step, step_mul);
10129               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10130                                       vec_def, up);
10131               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10132             }
10133
10134           /* Set the arguments of the phi node:  */
10135           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10136
10137           slp_node->push_vec_def (induction_phi);
10138         }
10139       if (!nested_in_vect_loop)
10140         {
10141           /* Fill up to the number of vectors we need for the whole group.  */
10142           nivs = least_common_multiple (group_size,
10143                                         const_nunits) / const_nunits;
10144           vec_steps.reserve (nivs-ivn);
10145           for (; ivn < nivs; ++ivn)
10146             {
10147               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10148               vec_steps.quick_push (vec_steps[0]);
10149             }
10150         }
10151
10152       /* Re-use IVs when we can.  We are generating further vector
10153          stmts by adding VF' * stride to the IVs generated above.  */
10154       if (ivn < nvects)
10155         {
10156           unsigned vfp
10157             = least_common_multiple (group_size, const_nunits) / group_size;
10158           tree lupdate_mul
10159             = build_vector_from_val (step_vectype,
10160                                      SCALAR_FLOAT_TYPE_P (stept)
10161                                      ? build_real_from_wide (stept,
10162                                                              vfp, UNSIGNED)
10163                                      : build_int_cstu (stept, vfp));
10164           for (; ivn < nvects; ++ivn)
10165             {
10166               gimple *iv
10167                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10168               tree def = gimple_get_lhs (iv);
10169               if (ivn < 2*nivs)
10170                 vec_steps[ivn - nivs]
10171                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10172                                   vec_steps[ivn - nivs], lupdate_mul);
10173               gimple_seq stmts = NULL;
10174               def = gimple_convert (&stmts, step_vectype, def);
10175               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10176                                   def, vec_steps[ivn % nivs]);
10177               def = gimple_convert (&stmts, vectype, def);
10178               if (gimple_code (iv) == GIMPLE_PHI)
10179                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10180               else
10181                 {
10182                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10183                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10184                 }
10185               slp_node->push_vec_def (def);
10186             }
10187         }
10188
10189       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10190       gcc_assert (!new_bb);
10191
10192       return true;
10193     }
10194
10195   init_expr = vect_phi_initial_value (phi);
10196
10197   gimple_seq stmts = NULL;
10198   if (!nested_in_vect_loop)
10199     {
10200       /* Convert the initial value to the IV update type.  */
10201       tree new_type = TREE_TYPE (step_expr);
10202       init_expr = gimple_convert (&stmts, new_type, init_expr);
10203
10204       /* If we are using the loop mask to "peel" for alignment then we need
10205          to adjust the start value here.  */
10206       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10207       if (skip_niters != NULL_TREE)
10208         {
10209           if (FLOAT_TYPE_P (vectype))
10210             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10211                                         skip_niters);
10212           else
10213             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10214           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10215                                          skip_niters, step_expr);
10216           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10217                                     init_expr, skip_step);
10218         }
10219     }
10220
10221   if (stmts)
10222     {
10223       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10224       gcc_assert (!new_bb);
10225     }
10226
10227   /* Create the vector that holds the initial_value of the induction.  */
10228   if (nested_in_vect_loop)
10229     {
10230       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
10231          been created during vectorization of previous stmts.  We obtain it
10232          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
10233       auto_vec<tree> vec_inits;
10234       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10235                                      init_expr, &vec_inits);
10236       vec_init = vec_inits[0];
10237       /* If the initial value is not of proper type, convert it.  */
10238       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10239         {
10240           new_stmt
10241             = gimple_build_assign (vect_get_new_ssa_name (vectype,
10242                                                           vect_simple_var,
10243                                                           "vec_iv_"),
10244                                    VIEW_CONVERT_EXPR,
10245                                    build1 (VIEW_CONVERT_EXPR, vectype,
10246                                            vec_init));
10247           vec_init = gimple_assign_lhs (new_stmt);
10248           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10249                                                  new_stmt);
10250           gcc_assert (!new_bb);
10251         }
10252     }
10253   else
10254     {
10255       /* iv_loop is the loop to be vectorized. Create:
10256          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10257       stmts = NULL;
10258       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10259
10260       unsigned HOST_WIDE_INT const_nunits;
10261       if (nunits.is_constant (&const_nunits))
10262         {
10263           tree_vector_builder elts (step_vectype, const_nunits, 1);
10264           elts.quick_push (new_name);
10265           for (i = 1; i < const_nunits; i++)
10266             {
10267               /* Create: new_name_i = new_name + step_expr  */
10268               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10269                                        new_name, step_expr);
10270               elts.quick_push (new_name);
10271             }
10272           /* Create a vector from [new_name_0, new_name_1, ...,
10273              new_name_nunits-1]  */
10274           vec_init = gimple_build_vector (&stmts, &elts);
10275         }
10276       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10277         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10278         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10279                                  new_name, step_expr);
10280       else
10281         {
10282           /* Build:
10283                 [base, base, base, ...]
10284                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10285           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10286           gcc_assert (flag_associative_math);
10287           tree index = build_index_vector (step_vectype, 0, 1);
10288           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10289                                                         new_name);
10290           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10291                                                         step_expr);
10292           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10293           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10294                                    vec_init, step_vec);
10295           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10296                                    vec_init, base_vec);
10297         }
10298       vec_init = gimple_convert (&stmts, vectype, vec_init);
10299
10300       if (stmts)
10301         {
10302           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10303           gcc_assert (!new_bb);
10304         }
10305     }
10306
10307
10308   /* Create the vector that holds the step of the induction.  */
10309   if (nested_in_vect_loop)
10310     /* iv_loop is nested in the loop to be vectorized. Generate:
10311        vec_step = [S, S, S, S]  */
10312     new_name = step_expr;
10313   else
10314     {
10315       /* iv_loop is the loop to be vectorized. Generate:
10316           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10317       gimple_seq seq = NULL;
10318       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10319         {
10320           expr = build_int_cst (integer_type_node, vf);
10321           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10322         }
10323       else
10324         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10325       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10326                                expr, step_expr);
10327       if (seq)
10328         {
10329           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10330           gcc_assert (!new_bb);
10331         }
10332     }
10333
10334   t = unshare_expr (new_name);
10335   gcc_assert (CONSTANT_CLASS_P (new_name)
10336               || TREE_CODE (new_name) == SSA_NAME);
10337   new_vec = build_vector_from_val (step_vectype, t);
10338   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10339                                new_vec, step_vectype, NULL);
10340
10341
10342   /* Create the following def-use cycle:
10343      loop prolog:
10344          vec_init = ...
10345          vec_step = ...
10346      loop:
10347          vec_iv = PHI <vec_init, vec_loop>
10348          ...
10349          STMT
10350          ...
10351          vec_loop = vec_iv + vec_step;  */
10352
10353   /* Create the induction-phi that defines the induction-operand.  */
10354   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10355   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10356   induc_def = PHI_RESULT (induction_phi);
10357
10358   /* Create the iv update inside the loop  */
10359   stmts = NULL;
10360   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10361   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10362   vec_def = gimple_convert (&stmts, vectype, vec_def);
10363   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10364   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10365
10366   /* Set the arguments of the phi node:  */
10367   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10368   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10369                UNKNOWN_LOCATION);
10370
10371   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10372   *vec_stmt = induction_phi;
10373
10374   /* In case that vectorization factor (VF) is bigger than the number
10375      of elements that we can fit in a vectype (nunits), we have to generate
10376      more than one vector stmt - i.e - we need to "unroll" the
10377      vector stmt by a factor VF/nunits.  For more details see documentation
10378      in vectorizable_operation.  */
10379
10380   if (ncopies > 1)
10381     {
10382       gimple_seq seq = NULL;
10383       /* FORNOW. This restriction should be relaxed.  */
10384       gcc_assert (!nested_in_vect_loop);
10385
10386       /* Create the vector that holds the step of the induction.  */
10387       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10388         {
10389           expr = build_int_cst (integer_type_node, nunits);
10390           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10391         }
10392       else
10393         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10394       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10395                                expr, step_expr);
10396       if (seq)
10397         {
10398           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10399           gcc_assert (!new_bb);
10400         }
10401
10402       t = unshare_expr (new_name);
10403       gcc_assert (CONSTANT_CLASS_P (new_name)
10404                   || TREE_CODE (new_name) == SSA_NAME);
10405       new_vec = build_vector_from_val (step_vectype, t);
10406       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10407                                    new_vec, step_vectype, NULL);
10408
10409       vec_def = induc_def;
10410       for (i = 1; i < ncopies + 1; i++)
10411         {
10412           /* vec_i = vec_prev + vec_step  */
10413           gimple_seq stmts = NULL;
10414           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10415           vec_def = gimple_build (&stmts,
10416                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10417           vec_def = gimple_convert (&stmts, vectype, vec_def);
10418
10419           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10420           if (i < ncopies)
10421             {
10422               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10423               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10424             }
10425           else
10426             {
10427               /* vec_1 = vec_iv + (VF/n * S)
10428                  vec_2 = vec_1 + (VF/n * S)
10429                  ...
10430                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10431
10432                  vec_n is used as vec_loop to save the large step register and
10433                  related operations.  */
10434               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10435                            UNKNOWN_LOCATION);
10436             }
10437         }
10438     }
10439
10440   if (dump_enabled_p ())
10441     dump_printf_loc (MSG_NOTE, vect_location,
10442                      "transform induction: created def-use cycle: %G%G",
10443                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10444
10445   return true;
10446 }
10447
10448 /* Function vectorizable_live_operation.
10449
10450    STMT_INFO computes a value that is used outside the loop.  Check if
10451    it can be supported.  */
10452
10453 bool
10454 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10455                              slp_tree slp_node, slp_instance slp_node_instance,
10456                              int slp_index, bool vec_stmt_p,
10457                              stmt_vector_for_cost *cost_vec)
10458 {
10459   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10460   imm_use_iterator imm_iter;
10461   tree lhs, lhs_type, bitsize;
10462   tree vectype = (slp_node
10463                   ? SLP_TREE_VECTYPE (slp_node)
10464                   : STMT_VINFO_VECTYPE (stmt_info));
10465   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10466   int ncopies;
10467   gimple *use_stmt;
10468   auto_vec<tree> vec_oprnds;
10469   int vec_entry = 0;
10470   poly_uint64 vec_index = 0;
10471
10472   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10473
10474   /* If a stmt of a reduction is live, vectorize it via
10475      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10476      validity so just trigger the transform here.  */
10477   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10478     {
10479       if (!vec_stmt_p)
10480         return true;
10481       if (slp_node)
10482         {
10483           /* For reduction chains the meta-info is attached to
10484              the group leader.  */
10485           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10486             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10487           /* For SLP reductions we vectorize the epilogue for
10488              all involved stmts together.  */
10489           else if (slp_index != 0)
10490             return true;
10491         }
10492       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10493       gcc_assert (reduc_info->is_reduc_info);
10494       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10495           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10496         return true;
10497       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10498                                         slp_node_instance);
10499       return true;
10500     }
10501
10502   /* If STMT is not relevant and it is a simple assignment and its inputs are
10503      invariant then it can remain in place, unvectorized.  The original last
10504      scalar value that it computes will be used.  */
10505   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10506     {
10507       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10508       if (dump_enabled_p ())
10509         dump_printf_loc (MSG_NOTE, vect_location,
10510                          "statement is simple and uses invariant.  Leaving in "
10511                          "place.\n");
10512       return true;
10513     }
10514
10515   if (slp_node)
10516     ncopies = 1;
10517   else
10518     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10519
10520   if (slp_node)
10521     {
10522       gcc_assert (slp_index >= 0);
10523
10524       /* Get the last occurrence of the scalar index from the concatenation of
10525          all the slp vectors. Calculate which slp vector it is and the index
10526          within.  */
10527       int num_scalar = SLP_TREE_LANES (slp_node);
10528       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10529       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10530
10531       /* Calculate which vector contains the result, and which lane of
10532          that vector we need.  */
10533       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10534         {
10535           if (dump_enabled_p ())
10536             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10537                              "Cannot determine which vector holds the"
10538                              " final result.\n");
10539           return false;
10540         }
10541     }
10542
10543   if (!vec_stmt_p)
10544     {
10545       /* No transformation required.  */
10546       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10547         {
10548           if (slp_node)
10549             {
10550               if (dump_enabled_p ())
10551                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10552                                  "can't operate on partial vectors "
10553                                  "because an SLP statement is live after "
10554                                  "the loop.\n");
10555               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10556             }
10557           else if (ncopies > 1)
10558             {
10559               if (dump_enabled_p ())
10560                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10561                                  "can't operate on partial vectors "
10562                                  "because ncopies is greater than 1.\n");
10563               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10564             }
10565           else
10566             {
10567               gcc_assert (ncopies == 1 && !slp_node);
10568               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10569                                                   OPTIMIZE_FOR_SPEED))
10570                 vect_record_loop_mask (loop_vinfo,
10571                                        &LOOP_VINFO_MASKS (loop_vinfo),
10572                                        1, vectype, NULL);
10573               else if (can_vec_extract_var_idx_p (
10574                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10575                 vect_record_loop_len (loop_vinfo,
10576                                       &LOOP_VINFO_LENS (loop_vinfo),
10577                                       1, vectype, 1);
10578               else
10579                 {
10580                   if (dump_enabled_p ())
10581                     dump_printf_loc (
10582                       MSG_MISSED_OPTIMIZATION, vect_location,
10583                       "can't operate on partial vectors "
10584                       "because the target doesn't support extract "
10585                       "last reduction.\n");
10586                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10587                 }
10588             }
10589         }
10590       /* ???  Enable for loop costing as well.  */
10591       if (!loop_vinfo)
10592         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10593                           0, vect_epilogue);
10594       return true;
10595     }
10596
10597   /* Use the lhs of the original scalar statement.  */
10598   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10599   if (dump_enabled_p ())
10600     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10601                      "stmt %G", stmt);
10602
10603   lhs = gimple_get_lhs (stmt);
10604   lhs_type = TREE_TYPE (lhs);
10605
10606   bitsize = vector_element_bits_tree (vectype);
10607
10608   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10609   tree vec_lhs, bitstart;
10610   gimple *vec_stmt;
10611   if (slp_node)
10612     {
10613       gcc_assert (!loop_vinfo
10614                   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10615                       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10616
10617       /* Get the correct slp vectorized stmt.  */
10618       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10619       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10620
10621       /* Get entry to use.  */
10622       bitstart = bitsize_int (vec_index);
10623       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10624     }
10625   else
10626     {
10627       /* For multiple copies, get the last copy.  */
10628       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10629       vec_lhs = gimple_get_lhs (vec_stmt);
10630
10631       /* Get the last lane in the vector.  */
10632       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10633     }
10634
10635   if (loop_vinfo)
10636     {
10637       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10638          requirement, insert one phi node for it.  It looks like:
10639            loop;
10640          BB:
10641            # lhs' = PHI <lhs>
10642          ==>
10643            loop;
10644          BB:
10645            # vec_lhs' = PHI <vec_lhs>
10646            new_tree = lane_extract <vec_lhs', ...>;
10647            lhs' = new_tree;  */
10648
10649       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10650       basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10651       gcc_assert (single_pred_p (exit_bb));
10652
10653       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10654       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10655       SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10656
10657       gimple_seq stmts = NULL;
10658       tree new_tree;
10659       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10660         {
10661           /* Emit:
10662
10663                SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10664
10665              where VEC_LHS is the vectorized live-out result and MASK is
10666              the loop mask for the final iteration.  */
10667           gcc_assert (ncopies == 1 && !slp_node);
10668           gimple_seq tem = NULL;
10669           gimple_stmt_iterator gsi = gsi_last (tem);
10670           tree len
10671             = vect_get_loop_len (loop_vinfo, &gsi,
10672                                  &LOOP_VINFO_LENS (loop_vinfo),
10673                                  1, vectype, 0, 0);
10674
10675           /* BIAS - 1.  */
10676           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10677           tree bias_minus_one
10678             = int_const_binop (MINUS_EXPR,
10679                                build_int_cst (TREE_TYPE (len), biasval),
10680                                build_one_cst (TREE_TYPE (len)));
10681
10682           /* LAST_INDEX = LEN + (BIAS - 1).  */
10683           tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10684                                           len, bias_minus_one);
10685
10686           /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
10687           tree scalar_res
10688             = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10689                             vec_lhs_phi, last_index);
10690
10691           /* Convert the extracted vector element to the scalar type.  */
10692           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10693         }
10694       else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10695         {
10696           /* Emit:
10697
10698                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10699
10700              where VEC_LHS is the vectorized live-out result and MASK is
10701              the loop mask for the final iteration.  */
10702           gcc_assert (ncopies == 1 && !slp_node);
10703           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10704           gimple_seq tem = NULL;
10705           gimple_stmt_iterator gsi = gsi_last (tem);
10706           tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10707                                           &LOOP_VINFO_MASKS (loop_vinfo),
10708                                           1, vectype, 0);
10709           gimple_seq_add_seq (&stmts, tem);
10710           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10711                                           mask, vec_lhs_phi);
10712
10713           /* Convert the extracted vector element to the scalar type.  */
10714           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10715         }
10716       else
10717         {
10718           tree bftype = TREE_TYPE (vectype);
10719           if (VECTOR_BOOLEAN_TYPE_P (vectype))
10720             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10721           new_tree = build3 (BIT_FIELD_REF, bftype,
10722                              vec_lhs_phi, bitsize, bitstart);
10723           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10724                                            &stmts, true, NULL_TREE);
10725         }
10726
10727       gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10728       if (stmts)
10729         gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10730
10731       /* Remove existing phis that copy from lhs and create copies
10732          from new_tree.  */
10733       gimple_stmt_iterator gsi;
10734       for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
10735         {
10736           gimple *phi = gsi_stmt (gsi);
10737           if ((gimple_phi_arg_def (phi, 0) == lhs))
10738             {
10739               remove_phi_node (&gsi, false);
10740               tree lhs_phi = gimple_phi_result (phi);
10741               gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10742               gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10743             }
10744           else
10745             gsi_next (&gsi);
10746         }
10747
10748       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
10749       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10750         gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10751     }
10752   else
10753     {
10754       /* For basic-block vectorization simply insert the lane-extraction.  */
10755       tree bftype = TREE_TYPE (vectype);
10756       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10757         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10758       tree new_tree = build3 (BIT_FIELD_REF, bftype,
10759                               vec_lhs, bitsize, bitstart);
10760       gimple_seq stmts = NULL;
10761       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10762                                        &stmts, true, NULL_TREE);
10763       if (TREE_CODE (new_tree) == SSA_NAME
10764           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10765         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10766       if (is_a <gphi *> (vec_stmt))
10767         {
10768           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10769           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10770         }
10771       else
10772         {
10773           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10774           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10775         }
10776
10777       /* Replace use of lhs with newly computed result.  If the use stmt is a
10778          single arg PHI, just replace all uses of PHI result.  It's necessary
10779          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10780       use_operand_p use_p;
10781       stmt_vec_info use_stmt_info;
10782       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10783         if (!is_gimple_debug (use_stmt)
10784             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10785                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10786           {
10787             /* ???  This can happen when the live lane ends up being
10788                rooted in a vector construction code-generated by an
10789                external SLP node (and code-generation for that already
10790                happened).  See gcc.dg/vect/bb-slp-47.c.
10791                Doing this is what would happen if that vector CTOR
10792                were not code-generated yet so it is not too bad.
10793                ???  In fact we'd likely want to avoid this situation
10794                in the first place.  */
10795             if (TREE_CODE (new_tree) == SSA_NAME
10796                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10797                 && gimple_code (use_stmt) != GIMPLE_PHI
10798                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10799                                                 use_stmt))
10800               {
10801                 if (dump_enabled_p ())
10802                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10803                                    "Using original scalar computation for "
10804                                    "live lane because use preceeds vector "
10805                                    "def\n");
10806                 continue;
10807               }
10808             /* ???  It can also happen that we end up pulling a def into
10809                a loop where replacing out-of-loop uses would require
10810                a new LC SSA PHI node.  Retain the original scalar in
10811                those cases as well.  PR98064.  */
10812             if (TREE_CODE (new_tree) == SSA_NAME
10813                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10814                 && (gimple_bb (use_stmt)->loop_father
10815                     != gimple_bb (vec_stmt)->loop_father)
10816                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10817                                         gimple_bb (use_stmt)->loop_father))
10818               {
10819                 if (dump_enabled_p ())
10820                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10821                                    "Using original scalar computation for "
10822                                    "live lane because there is an out-of-loop "
10823                                    "definition for it\n");
10824                 continue;
10825               }
10826             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10827               SET_USE (use_p, new_tree);
10828             update_stmt (use_stmt);
10829           }
10830     }
10831
10832   return true;
10833 }
10834
10835 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
10836
10837 static void
10838 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10839 {
10840   ssa_op_iter op_iter;
10841   imm_use_iterator imm_iter;
10842   def_operand_p def_p;
10843   gimple *ustmt;
10844
10845   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10846     {
10847       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10848         {
10849           basic_block bb;
10850
10851           if (!is_gimple_debug (ustmt))
10852             continue;
10853
10854           bb = gimple_bb (ustmt);
10855
10856           if (!flow_bb_inside_loop_p (loop, bb))
10857             {
10858               if (gimple_debug_bind_p (ustmt))
10859                 {
10860                   if (dump_enabled_p ())
10861                     dump_printf_loc (MSG_NOTE, vect_location,
10862                                      "killing debug use\n");
10863
10864                   gimple_debug_bind_reset_value (ustmt);
10865                   update_stmt (ustmt);
10866                 }
10867               else
10868                 gcc_unreachable ();
10869             }
10870         }
10871     }
10872 }
10873
10874 /* Given loop represented by LOOP_VINFO, return true if computation of
10875    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10876    otherwise.  */
10877
10878 static bool
10879 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10880 {
10881   /* Constant case.  */
10882   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10883     {
10884       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10885       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10886
10887       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10888       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10889       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10890         return true;
10891     }
10892
10893   widest_int max;
10894   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10895   /* Check the upper bound of loop niters.  */
10896   if (get_max_loop_iterations (loop, &max))
10897     {
10898       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10899       signop sgn = TYPE_SIGN (type);
10900       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10901       if (max < type_max)
10902         return true;
10903     }
10904   return false;
10905 }
10906
10907 /* Return a mask type with half the number of elements as OLD_TYPE,
10908    given that it should have mode NEW_MODE.  */
10909
10910 tree
10911 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10912 {
10913   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10914   return build_truth_vector_type_for_mode (nunits, new_mode);
10915 }
10916
10917 /* Return a mask type with twice as many elements as OLD_TYPE,
10918    given that it should have mode NEW_MODE.  */
10919
10920 tree
10921 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10922 {
10923   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10924   return build_truth_vector_type_for_mode (nunits, new_mode);
10925 }
10926
10927 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10928    contain a sequence of NVECTORS masks that each control a vector of type
10929    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
10930    these vector masks with the vector version of SCALAR_MASK.  */
10931
10932 void
10933 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10934                        unsigned int nvectors, tree vectype, tree scalar_mask)
10935 {
10936   gcc_assert (nvectors != 0);
10937
10938   if (scalar_mask)
10939     {
10940       scalar_cond_masked_key cond (scalar_mask, nvectors);
10941       loop_vinfo->scalar_cond_masked_set.add (cond);
10942     }
10943
10944   masks->mask_set.add (std::make_pair (vectype, nvectors));
10945 }
10946
10947 /* Given a complete set of masks MASKS, extract mask number INDEX
10948    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10949    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
10950
10951    See the comment above vec_loop_masks for more details about the mask
10952    arrangement.  */
10953
10954 tree
10955 vect_get_loop_mask (loop_vec_info loop_vinfo,
10956                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10957                     unsigned int nvectors, tree vectype, unsigned int index)
10958 {
10959   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10960       == vect_partial_vectors_while_ult)
10961     {
10962       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10963       tree mask_type = rgm->type;
10964
10965       /* Populate the rgroup's mask array, if this is the first time we've
10966          used it.  */
10967       if (rgm->controls.is_empty ())
10968         {
10969           rgm->controls.safe_grow_cleared (nvectors, true);
10970           for (unsigned int i = 0; i < nvectors; ++i)
10971             {
10972               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10973               /* Provide a dummy definition until the real one is available.  */
10974               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10975               rgm->controls[i] = mask;
10976             }
10977         }
10978
10979       tree mask = rgm->controls[index];
10980       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10981                     TYPE_VECTOR_SUBPARTS (vectype)))
10982         {
10983           /* A loop mask for data type X can be reused for data type Y
10984              if X has N times more elements than Y and if Y's elements
10985              are N times bigger than X's.  In this case each sequence
10986              of N elements in the loop mask will be all-zero or all-one.
10987              We can then view-convert the mask so that each sequence of
10988              N elements is replaced by a single element.  */
10989           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10990                                   TYPE_VECTOR_SUBPARTS (vectype)));
10991           gimple_seq seq = NULL;
10992           mask_type = truth_type_for (vectype);
10993           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10994           if (seq)
10995             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10996         }
10997       return mask;
10998     }
10999   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11000            == vect_partial_vectors_avx512)
11001     {
11002       /* The number of scalars per iteration and the number of vectors are
11003          both compile-time constants.  */
11004       unsigned int nscalars_per_iter
11005         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11006                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11007
11008       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11009
11010       /* The stored nV is dependent on the mask type produced.  */
11011       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11012                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11013                   == rgm->factor);
11014       nvectors = rgm->factor;
11015
11016       /* Populate the rgroup's mask array, if this is the first time we've
11017          used it.  */
11018       if (rgm->controls.is_empty ())
11019         {
11020           rgm->controls.safe_grow_cleared (nvectors, true);
11021           for (unsigned int i = 0; i < nvectors; ++i)
11022             {
11023               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11024               /* Provide a dummy definition until the real one is available.  */
11025               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11026               rgm->controls[i] = mask;
11027             }
11028         }
11029       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11030                     TYPE_VECTOR_SUBPARTS (vectype)))
11031         return rgm->controls[index];
11032
11033       /* Split the vector if needed.  Since we are dealing with integer mode
11034          masks with AVX512 we can operate on the integer representation
11035          performing the whole vector shifting.  */
11036       unsigned HOST_WIDE_INT factor;
11037       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11038                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
11039       gcc_assert (ok);
11040       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11041       tree mask_type = truth_type_for (vectype);
11042       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11043       unsigned vi = index / factor;
11044       unsigned vpart = index % factor;
11045       tree vec = rgm->controls[vi];
11046       gimple_seq seq = NULL;
11047       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11048                           lang_hooks.types.type_for_mode
11049                                 (TYPE_MODE (rgm->type), 1), vec);
11050       /* For integer mode masks simply shift the right bits into position.  */
11051       if (vpart != 0)
11052         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11053                             build_int_cst (integer_type_node,
11054                                            (TYPE_VECTOR_SUBPARTS (vectype)
11055                                             * vpart)));
11056       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11057                                     (TYPE_MODE (mask_type), 1), vec);
11058       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11059       if (seq)
11060         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11061       return vec;
11062     }
11063   else
11064     gcc_unreachable ();
11065 }
11066
11067 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11068    lengths for controlling an operation on VECTYPE.  The operation splits
11069    each element of VECTYPE into FACTOR separate subelements, measuring the
11070    length as a number of these subelements.  */
11071
11072 void
11073 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11074                       unsigned int nvectors, tree vectype, unsigned int factor)
11075 {
11076   gcc_assert (nvectors != 0);
11077   if (lens->length () < nvectors)
11078     lens->safe_grow_cleared (nvectors, true);
11079   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11080
11081   /* The number of scalars per iteration, scalar occupied bytes and
11082      the number of vectors are both compile-time constants.  */
11083   unsigned int nscalars_per_iter
11084     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11085                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11086
11087   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11088     {
11089       /* For now, we only support cases in which all loads and stores fall back
11090          to VnQI or none do.  */
11091       gcc_assert (!rgl->max_nscalars_per_iter
11092                   || (rgl->factor == 1 && factor == 1)
11093                   || (rgl->max_nscalars_per_iter * rgl->factor
11094                       == nscalars_per_iter * factor));
11095       rgl->max_nscalars_per_iter = nscalars_per_iter;
11096       rgl->type = vectype;
11097       rgl->factor = factor;
11098     }
11099 }
11100
11101 /* Given a complete set of lengths LENS, extract length number INDEX
11102    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11103    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
11104    multipled by the number of elements that should be processed.
11105    Insert any set-up statements before GSI.  */
11106
11107 tree
11108 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11109                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11110                    unsigned int index, unsigned int factor)
11111 {
11112   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11113   bool use_bias_adjusted_len =
11114     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11115
11116   /* Populate the rgroup's len array, if this is the first time we've
11117      used it.  */
11118   if (rgl->controls.is_empty ())
11119     {
11120       rgl->controls.safe_grow_cleared (nvectors, true);
11121       for (unsigned int i = 0; i < nvectors; ++i)
11122         {
11123           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11124           gcc_assert (len_type != NULL_TREE);
11125
11126           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11127
11128           /* Provide a dummy definition until the real one is available.  */
11129           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11130           rgl->controls[i] = len;
11131
11132           if (use_bias_adjusted_len)
11133             {
11134               gcc_assert (i == 0);
11135               tree adjusted_len =
11136                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11137               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11138               rgl->bias_adjusted_ctrl = adjusted_len;
11139             }
11140         }
11141     }
11142
11143   if (use_bias_adjusted_len)
11144     return rgl->bias_adjusted_ctrl;
11145
11146   tree loop_len = rgl->controls[index];
11147   if (rgl->factor == 1 && factor == 1)
11148     {
11149       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11150       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11151       if (maybe_ne (nunits1, nunits2))
11152         {
11153           /* A loop len for data type X can be reused for data type Y
11154              if X has N times more elements than Y and if Y's elements
11155              are N times bigger than X's.  */
11156           gcc_assert (multiple_p (nunits1, nunits2));
11157           factor = exact_div (nunits1, nunits2).to_constant ();
11158           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11159           gimple_seq seq = NULL;
11160           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11161                                    build_int_cst (iv_type, factor));
11162           if (seq)
11163             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11164         }
11165     }
11166   return loop_len;
11167 }
11168
11169 /* Scale profiling counters by estimation for LOOP which is vectorized
11170    by factor VF.
11171    If FLAT is true, the loop we started with had unrealistically flat
11172    profile.  */
11173
11174 static void
11175 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11176 {
11177   /* For flat profiles do not scale down proportionally by VF and only
11178      cap by known iteration count bounds.  */
11179   if (flat)
11180     {
11181       if (dump_file && (dump_flags & TDF_DETAILS))
11182         fprintf (dump_file,
11183                  "Vectorized loop profile seems flat; not scaling iteration "
11184                  "count down by the vectorization factor %i\n", vf);
11185       scale_loop_profile (loop, profile_probability::always (),
11186                           get_likely_max_loop_iterations_int (loop));
11187       return;
11188     }
11189   /* Loop body executes VF fewer times and exit increases VF times.  */
11190   profile_count entry_count = loop_preheader_edge (loop)->count ();
11191
11192   /* If we have unreliable loop profile avoid dropping entry
11193      count bellow header count.  This can happen since loops
11194      has unrealistically low trip counts.  */
11195   while (vf > 1
11196          && loop->header->count > entry_count
11197          && loop->header->count < entry_count * vf)
11198     {
11199       if (dump_file && (dump_flags & TDF_DETAILS))
11200         fprintf (dump_file,
11201                  "Vectorization factor %i seems too large for profile "
11202                  "prevoiusly believed to be consistent; reducing.\n", vf);
11203       vf /= 2;
11204     }
11205
11206   if (entry_count.nonzero_p ())
11207     set_edge_probability_and_rescale_others
11208             (exit_e,
11209              entry_count.probability_in (loop->header->count / vf));
11210   /* Avoid producing very large exit probability when we do not have
11211      sensible profile.  */
11212   else if (exit_e->probability < profile_probability::always () / (vf * 2))
11213     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11214   loop->latch->count = single_pred_edge (loop->latch)->count ();
11215
11216   scale_loop_profile (loop, profile_probability::always () / vf,
11217                       get_likely_max_loop_iterations_int (loop));
11218 }
11219
11220 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11221    latch edge values originally defined by it.  */
11222
11223 static void
11224 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11225                                      stmt_vec_info def_stmt_info)
11226 {
11227   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11228   if (!def || TREE_CODE (def) != SSA_NAME)
11229     return;
11230   stmt_vec_info phi_info;
11231   imm_use_iterator iter;
11232   use_operand_p use_p;
11233   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11234     {
11235       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11236       if (!phi)
11237         continue;
11238       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11239             && (phi_info = loop_vinfo->lookup_stmt (phi))
11240             && STMT_VINFO_RELEVANT_P (phi_info)))
11241         continue;
11242       loop_p loop = gimple_bb (phi)->loop_father;
11243       edge e = loop_latch_edge (loop);
11244       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11245         continue;
11246
11247       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11248           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11249           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11250         {
11251           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11252           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11253           gcc_assert (phi_defs.length () == latch_defs.length ());
11254           for (unsigned i = 0; i < phi_defs.length (); ++i)
11255             add_phi_arg (as_a <gphi *> (phi_defs[i]),
11256                          gimple_get_lhs (latch_defs[i]), e,
11257                          gimple_phi_arg_location (phi, e->dest_idx));
11258         }
11259       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11260         {
11261           /* For first order recurrences we have to update both uses of
11262              the latch definition, the one in the PHI node and the one
11263              in the generated VEC_PERM_EXPR.  */
11264           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11265           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11266           gcc_assert (phi_defs.length () == latch_defs.length ());
11267           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11268           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11269           for (unsigned i = 0; i < phi_defs.length (); ++i)
11270             {
11271               gassign *perm = as_a <gassign *> (phi_defs[i]);
11272               if (i > 0)
11273                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11274               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11275               update_stmt (perm);
11276             }
11277           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11278                        gimple_phi_arg_location (phi, e->dest_idx));
11279         }
11280     }
11281 }
11282
11283 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11284    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11285    stmt_vec_info.  */
11286
11287 static bool
11288 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11289                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11290 {
11291   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11292   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11293
11294   if (dump_enabled_p ())
11295     dump_printf_loc (MSG_NOTE, vect_location,
11296                      "------>vectorizing statement: %G", stmt_info->stmt);
11297
11298   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11299     vect_loop_kill_debug_uses (loop, stmt_info);
11300
11301   if (!STMT_VINFO_RELEVANT_P (stmt_info)
11302       && !STMT_VINFO_LIVE_P (stmt_info))
11303     return false;
11304
11305   if (STMT_VINFO_VECTYPE (stmt_info))
11306     {
11307       poly_uint64 nunits
11308         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11309       if (!STMT_SLP_TYPE (stmt_info)
11310           && maybe_ne (nunits, vf)
11311           && dump_enabled_p ())
11312         /* For SLP VF is set according to unrolling factor, and not
11313            to vector size, hence for SLP this print is not valid.  */
11314         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11315     }
11316
11317   /* Pure SLP statements have already been vectorized.  We still need
11318      to apply loop vectorization to hybrid SLP statements.  */
11319   if (PURE_SLP_STMT (stmt_info))
11320     return false;
11321
11322   if (dump_enabled_p ())
11323     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11324
11325   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11326     *seen_store = stmt_info;
11327
11328   return true;
11329 }
11330
11331 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11332    in the hash_map with its corresponding values.  */
11333
11334 static tree
11335 find_in_mapping (tree t, void *context)
11336 {
11337   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11338
11339   tree *value = mapping->get (t);
11340   return value ? *value : t;
11341 }
11342
11343 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11344    original loop that has now been vectorized.
11345
11346    The inits of the data_references need to be advanced with the number of
11347    iterations of the main loop.  This has been computed in vect_do_peeling and
11348    is stored in parameter ADVANCE.  We first restore the data_references
11349    initial offset with the values recored in ORIG_DRS_INIT.
11350
11351    Since the loop_vec_info of this EPILOGUE was constructed for the original
11352    loop, its stmt_vec_infos all point to the original statements.  These need
11353    to be updated to point to their corresponding copies as well as the SSA_NAMES
11354    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11355
11356    The data_reference's connections also need to be updated.  Their
11357    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11358    stmt_vec_infos, their statements need to point to their corresponding copy,
11359    if they are gather loads or scatter stores then their reference needs to be
11360    updated to point to its corresponding copy and finally we set
11361    'base_misaligned' to false as we have already peeled for alignment in the
11362    prologue of the main loop.  */
11363
11364 static void
11365 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11366 {
11367   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11368   auto_vec<gimple *> stmt_worklist;
11369   hash_map<tree,tree> mapping;
11370   gimple *orig_stmt, *new_stmt;
11371   gimple_stmt_iterator epilogue_gsi;
11372   gphi_iterator epilogue_phi_gsi;
11373   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11374   basic_block *epilogue_bbs = get_loop_body (epilogue);
11375   unsigned i;
11376
11377   free (LOOP_VINFO_BBS (epilogue_vinfo));
11378   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11379
11380   /* Advance data_reference's with the number of iterations of the previous
11381      loop and its prologue.  */
11382   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11383
11384
11385   /* The EPILOGUE loop is a copy of the original loop so they share the same
11386      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11387      point to the copied statements.  We also create a mapping of all LHS' in
11388      the original loop and all the LHS' in the EPILOGUE and create worklists to
11389      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11390   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11391     {
11392       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11393            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11394         {
11395           new_stmt = epilogue_phi_gsi.phi ();
11396
11397           gcc_assert (gimple_uid (new_stmt) > 0);
11398           stmt_vinfo
11399             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11400
11401           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11402           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11403
11404           mapping.put (gimple_phi_result (orig_stmt),
11405                        gimple_phi_result (new_stmt));
11406           /* PHI nodes can not have patterns or related statements.  */
11407           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11408                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11409         }
11410
11411       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11412            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11413         {
11414           new_stmt = gsi_stmt (epilogue_gsi);
11415           if (is_gimple_debug (new_stmt))
11416             continue;
11417
11418           gcc_assert (gimple_uid (new_stmt) > 0);
11419           stmt_vinfo
11420             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11421
11422           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11423           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11424
11425           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11426             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11427
11428           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11429             {
11430               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11431               for (gimple_stmt_iterator gsi = gsi_start (seq);
11432                    !gsi_end_p (gsi); gsi_next (&gsi))
11433                 stmt_worklist.safe_push (gsi_stmt (gsi));
11434             }
11435
11436           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11437           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11438             {
11439               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11440               stmt_worklist.safe_push (stmt);
11441               /* Set BB such that the assert in
11442                 'get_initial_def_for_reduction' is able to determine that
11443                 the BB of the related stmt is inside this loop.  */
11444               gimple_set_bb (stmt,
11445                              gimple_bb (new_stmt));
11446               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11447               gcc_assert (related_vinfo == NULL
11448                           || related_vinfo == stmt_vinfo);
11449             }
11450         }
11451     }
11452
11453   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11454      using the original main loop and thus need to be updated to refer to the
11455      cloned variables used in the epilogue.  */
11456   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11457     {
11458       gimple *stmt = stmt_worklist[i];
11459       tree *new_op;
11460
11461       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11462         {
11463           tree op = gimple_op (stmt, j);
11464           if ((new_op = mapping.get(op)))
11465             gimple_set_op (stmt, j, *new_op);
11466           else
11467             {
11468               /* PR92429: The last argument of simplify_replace_tree disables
11469                  folding when replacing arguments.  This is required as
11470                  otherwise you might end up with different statements than the
11471                  ones analyzed in vect_loop_analyze, leading to different
11472                  vectorization.  */
11473               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11474                                           &find_in_mapping, &mapping, false);
11475               gimple_set_op (stmt, j, op);
11476             }
11477         }
11478     }
11479
11480   struct data_reference *dr;
11481   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11482   FOR_EACH_VEC_ELT (datarefs, i, dr)
11483     {
11484       orig_stmt = DR_STMT (dr);
11485       gcc_assert (gimple_uid (orig_stmt) > 0);
11486       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11487       /* Data references for gather loads and scatter stores do not use the
11488          updated offset we set using ADVANCE.  Instead we have to make sure the
11489          reference in the data references point to the corresponding copy of
11490          the original in the epilogue.  Make sure to update both
11491          gather/scatters recognized by dataref analysis and also other
11492          refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
11493       auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11494       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11495           || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11496         {
11497           DR_REF (dr)
11498             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11499                                      &find_in_mapping, &mapping);
11500           DR_BASE_ADDRESS (dr)
11501             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11502                                      &find_in_mapping, &mapping);
11503         }
11504       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11505       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11506       /* The vector size of the epilogue is smaller than that of the main loop
11507          so the alignment is either the same or lower. This means the dr will
11508          thus by definition be aligned.  */
11509       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11510     }
11511
11512   epilogue_vinfo->shared->datarefs_copy.release ();
11513   epilogue_vinfo->shared->save_datarefs ();
11514 }
11515
11516 /* Function vect_transform_loop.
11517
11518    The analysis phase has determined that the loop is vectorizable.
11519    Vectorize the loop - created vectorized stmts to replace the scalar
11520    stmts in the loop, and update the loop exit condition.
11521    Returns scalar epilogue loop if any.  */
11522
11523 class loop *
11524 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11525 {
11526   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11527   class loop *epilogue = NULL;
11528   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11529   int nbbs = loop->num_nodes;
11530   int i;
11531   tree niters_vector = NULL_TREE;
11532   tree step_vector = NULL_TREE;
11533   tree niters_vector_mult_vf = NULL_TREE;
11534   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11535   unsigned int lowest_vf = constant_lower_bound (vf);
11536   gimple *stmt;
11537   bool check_profitability = false;
11538   unsigned int th;
11539   bool flat = maybe_flat_loop_profile (loop);
11540
11541   DUMP_VECT_SCOPE ("vec_transform_loop");
11542
11543   loop_vinfo->shared->check_datarefs ();
11544
11545   /* Use the more conservative vectorization threshold.  If the number
11546      of iterations is constant assume the cost check has been performed
11547      by our caller.  If the threshold makes all loops profitable that
11548      run at least the (estimated) vectorization factor number of times
11549      checking is pointless, too.  */
11550   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11551   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11552     {
11553       if (dump_enabled_p ())
11554         dump_printf_loc (MSG_NOTE, vect_location,
11555                          "Profitability threshold is %d loop iterations.\n",
11556                          th);
11557       check_profitability = true;
11558     }
11559
11560   /* Make sure there exists a single-predecessor exit bb.  Do this before
11561      versioning.   */
11562   edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11563   if (! single_pred_p (e->dest))
11564     {
11565       split_loop_exit_edge (e, true);
11566       if (dump_enabled_p ())
11567         dump_printf (MSG_NOTE, "split exit edge\n");
11568     }
11569
11570   /* Version the loop first, if required, so the profitability check
11571      comes first.  */
11572
11573   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11574     {
11575       class loop *sloop
11576         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11577       sloop->force_vectorize = false;
11578       check_profitability = false;
11579     }
11580
11581   /* Make sure there exists a single-predecessor exit bb also on the
11582      scalar loop copy.  Do this after versioning but before peeling
11583      so CFG structure is fine for both scalar and if-converted loop
11584      to make slpeel_duplicate_current_defs_from_edges face matched
11585      loop closed PHI nodes on the exit.  */
11586   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11587     {
11588       e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11589       if (! single_pred_p (e->dest))
11590         {
11591           split_loop_exit_edge (e, true);
11592           if (dump_enabled_p ())
11593             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11594         }
11595     }
11596
11597   tree niters = vect_build_loop_niters (loop_vinfo);
11598   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11599   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11600   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11601   tree advance;
11602   drs_init_vec orig_drs_init;
11603
11604   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11605                               &step_vector, &niters_vector_mult_vf, th,
11606                               check_profitability, niters_no_overflow,
11607                               &advance);
11608   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11609       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11610     {
11611       /* Ifcvt duplicates loop preheader, loop body and produces an basic
11612          block after loop exit.  We need to scale all that.  */
11613       basic_block preheader
11614         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11615       preheader->count
11616         = preheader->count.apply_probability
11617               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11618       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11619                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11620       single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11621         = preheader->count;
11622     }
11623
11624   if (niters_vector == NULL_TREE)
11625     {
11626       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11627           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11628           && known_eq (lowest_vf, vf))
11629         {
11630           niters_vector
11631             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11632                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11633           step_vector = build_one_cst (TREE_TYPE (niters));
11634         }
11635       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11636         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11637                                      &step_vector, niters_no_overflow);
11638       else
11639         /* vect_do_peeling subtracted the number of peeled prologue
11640            iterations from LOOP_VINFO_NITERS.  */
11641         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11642                                      &niters_vector, &step_vector,
11643                                      niters_no_overflow);
11644     }
11645
11646   /* 1) Make sure the loop header has exactly two entries
11647      2) Make sure we have a preheader basic block.  */
11648
11649   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11650
11651   split_edge (loop_preheader_edge (loop));
11652
11653   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11654     /* This will deal with any possible peeling.  */
11655     vect_prepare_for_masked_peels (loop_vinfo);
11656
11657   /* Schedule the SLP instances first, then handle loop vectorization
11658      below.  */
11659   if (!loop_vinfo->slp_instances.is_empty ())
11660     {
11661       DUMP_VECT_SCOPE ("scheduling SLP instances");
11662       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11663     }
11664
11665   /* FORNOW: the vectorizer supports only loops which body consist
11666      of one basic block (header + empty latch). When the vectorizer will
11667      support more involved loop forms, the order by which the BBs are
11668      traversed need to be reconsidered.  */
11669
11670   for (i = 0; i < nbbs; i++)
11671     {
11672       basic_block bb = bbs[i];
11673       stmt_vec_info stmt_info;
11674
11675       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11676            gsi_next (&si))
11677         {
11678           gphi *phi = si.phi ();
11679           if (dump_enabled_p ())
11680             dump_printf_loc (MSG_NOTE, vect_location,
11681                              "------>vectorizing phi: %G", (gimple *) phi);
11682           stmt_info = loop_vinfo->lookup_stmt (phi);
11683           if (!stmt_info)
11684             continue;
11685
11686           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11687             vect_loop_kill_debug_uses (loop, stmt_info);
11688
11689           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11690               && !STMT_VINFO_LIVE_P (stmt_info))
11691             continue;
11692
11693           if (STMT_VINFO_VECTYPE (stmt_info)
11694               && (maybe_ne
11695                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11696               && dump_enabled_p ())
11697             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11698
11699           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11700                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11701                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11702                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11703                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11704                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11705               && ! PURE_SLP_STMT (stmt_info))
11706             {
11707               if (dump_enabled_p ())
11708                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11709               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11710             }
11711         }
11712
11713       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11714            gsi_next (&si))
11715         {
11716           gphi *phi = si.phi ();
11717           stmt_info = loop_vinfo->lookup_stmt (phi);
11718           if (!stmt_info)
11719             continue;
11720
11721           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11722               && !STMT_VINFO_LIVE_P (stmt_info))
11723             continue;
11724
11725           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11726                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11727                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11728                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11729                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11730                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11731               && ! PURE_SLP_STMT (stmt_info))
11732             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11733         }
11734
11735       for (gimple_stmt_iterator si = gsi_start_bb (bb);
11736            !gsi_end_p (si);)
11737         {
11738           stmt = gsi_stmt (si);
11739           /* During vectorization remove existing clobber stmts.  */
11740           if (gimple_clobber_p (stmt))
11741             {
11742               unlink_stmt_vdef (stmt);
11743               gsi_remove (&si, true);
11744               release_defs (stmt);
11745             }
11746           else
11747             {
11748               /* Ignore vector stmts created in the outer loop.  */
11749               stmt_info = loop_vinfo->lookup_stmt (stmt);
11750
11751               /* vector stmts created in the outer-loop during vectorization of
11752                  stmts in an inner-loop may not have a stmt_info, and do not
11753                  need to be vectorized.  */
11754               stmt_vec_info seen_store = NULL;
11755               if (stmt_info)
11756                 {
11757                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11758                     {
11759                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11760                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11761                            !gsi_end_p (subsi); gsi_next (&subsi))
11762                         {
11763                           stmt_vec_info pat_stmt_info
11764                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11765                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11766                                                     &si, &seen_store);
11767                         }
11768                       stmt_vec_info pat_stmt_info
11769                         = STMT_VINFO_RELATED_STMT (stmt_info);
11770                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11771                                                     &si, &seen_store))
11772                         maybe_set_vectorized_backedge_value (loop_vinfo,
11773                                                              pat_stmt_info);
11774                     }
11775                   else
11776                     {
11777                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11778                                                     &seen_store))
11779                         maybe_set_vectorized_backedge_value (loop_vinfo,
11780                                                              stmt_info);
11781                     }
11782                 }
11783               gsi_next (&si);
11784               if (seen_store)
11785                 {
11786                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11787                     /* Interleaving.  If IS_STORE is TRUE, the
11788                        vectorization of the interleaving chain was
11789                        completed - free all the stores in the chain.  */
11790                     vect_remove_stores (loop_vinfo,
11791                                         DR_GROUP_FIRST_ELEMENT (seen_store));
11792                   else
11793                     /* Free the attached stmt_vec_info and remove the stmt.  */
11794                     loop_vinfo->remove_stmt (stmt_info);
11795                 }
11796             }
11797         }
11798
11799       /* Stub out scalar statements that must not survive vectorization.
11800          Doing this here helps with grouped statements, or statements that
11801          are involved in patterns.  */
11802       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11803            !gsi_end_p (gsi); gsi_next (&gsi))
11804         {
11805           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11806           if (!call || !gimple_call_internal_p (call))
11807             continue;
11808           internal_fn ifn = gimple_call_internal_fn (call);
11809           if (ifn == IFN_MASK_LOAD)
11810             {
11811               tree lhs = gimple_get_lhs (call);
11812               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11813                 {
11814                   tree zero = build_zero_cst (TREE_TYPE (lhs));
11815                   gimple *new_stmt = gimple_build_assign (lhs, zero);
11816                   gsi_replace (&gsi, new_stmt, true);
11817                 }
11818             }
11819           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11820             {
11821               tree lhs = gimple_get_lhs (call);
11822               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11823                 {
11824                   tree else_arg
11825                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11826                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11827                   gsi_replace (&gsi, new_stmt, true);
11828                 }
11829             }
11830         }
11831     }                           /* BBs in loop */
11832
11833   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11834      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11835   if (integer_onep (step_vector))
11836     niters_no_overflow = true;
11837   vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11838                            niters_vector, step_vector, niters_vector_mult_vf,
11839                            !niters_no_overflow);
11840
11841   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11842
11843   /* True if the final iteration might not handle a full vector's
11844      worth of scalar iterations.  */
11845   bool final_iter_may_be_partial
11846     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11847   /* The minimum number of iterations performed by the epilogue.  This
11848      is 1 when peeling for gaps because we always need a final scalar
11849      iteration.  */
11850   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11851   /* +1 to convert latch counts to loop iteration counts,
11852      -min_epilogue_iters to remove iterations that cannot be performed
11853        by the vector code.  */
11854   int bias_for_lowest = 1 - min_epilogue_iters;
11855   int bias_for_assumed = bias_for_lowest;
11856   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11857   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11858     {
11859       /* When the amount of peeling is known at compile time, the first
11860          iteration will have exactly alignment_npeels active elements.
11861          In the worst case it will have at least one.  */
11862       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11863       bias_for_lowest += lowest_vf - min_first_active;
11864       bias_for_assumed += assumed_vf - min_first_active;
11865     }
11866   /* In these calculations the "- 1" converts loop iteration counts
11867      back to latch counts.  */
11868   if (loop->any_upper_bound)
11869     {
11870       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11871       loop->nb_iterations_upper_bound
11872         = (final_iter_may_be_partial
11873            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11874                             lowest_vf) - 1
11875            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11876                              lowest_vf) - 1);
11877       if (main_vinfo
11878           /* Both peeling for alignment and peeling for gaps can end up
11879              with the scalar epilogue running for more than VF-1 iterations.  */
11880           && !main_vinfo->peeling_for_alignment
11881           && !main_vinfo->peeling_for_gaps)
11882         {
11883           unsigned int bound;
11884           poly_uint64 main_iters
11885             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11886                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11887           main_iters
11888             = upper_bound (main_iters,
11889                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11890           if (can_div_away_from_zero_p (main_iters,
11891                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11892                                         &bound))
11893             loop->nb_iterations_upper_bound
11894               = wi::umin ((bound_wide_int) (bound - 1),
11895                           loop->nb_iterations_upper_bound);
11896       }
11897   }
11898   if (loop->any_likely_upper_bound)
11899     loop->nb_iterations_likely_upper_bound
11900       = (final_iter_may_be_partial
11901          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11902                           + bias_for_lowest, lowest_vf) - 1
11903          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11904                            + bias_for_lowest, lowest_vf) - 1);
11905   if (loop->any_estimate)
11906     loop->nb_iterations_estimate
11907       = (final_iter_may_be_partial
11908          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11909                           assumed_vf) - 1
11910          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11911                            assumed_vf) - 1);
11912   scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11913                                assumed_vf, flat);
11914
11915   if (dump_enabled_p ())
11916     {
11917       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11918         {
11919           dump_printf_loc (MSG_NOTE, vect_location,
11920                            "LOOP VECTORIZED\n");
11921           if (loop->inner)
11922             dump_printf_loc (MSG_NOTE, vect_location,
11923                              "OUTER LOOP VECTORIZED\n");
11924           dump_printf (MSG_NOTE, "\n");
11925         }
11926       else
11927         dump_printf_loc (MSG_NOTE, vect_location,
11928                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11929                          GET_MODE_NAME (loop_vinfo->vector_mode));
11930     }
11931
11932   /* Loops vectorized with a variable factor won't benefit from
11933      unrolling/peeling.  */
11934   if (!vf.is_constant ())
11935     {
11936       loop->unroll = 1;
11937       if (dump_enabled_p ())
11938         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11939                          " variable-length vectorization factor\n");
11940     }
11941   /* Free SLP instances here because otherwise stmt reference counting
11942      won't work.  */
11943   slp_instance instance;
11944   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11945     vect_free_slp_instance (instance);
11946   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11947   /* Clear-up safelen field since its value is invalid after vectorization
11948      since vectorized loop can have loop-carried dependencies.  */
11949   loop->safelen = 0;
11950
11951   if (epilogue)
11952     {
11953       update_epilogue_loop_vinfo (epilogue, advance);
11954
11955       epilogue->simduid = loop->simduid;
11956       epilogue->force_vectorize = loop->force_vectorize;
11957       epilogue->dont_vectorize = false;
11958     }
11959
11960   return epilogue;
11961 }
11962
11963 /* The code below is trying to perform simple optimization - revert
11964    if-conversion for masked stores, i.e. if the mask of a store is zero
11965    do not perform it and all stored value producers also if possible.
11966    For example,
11967      for (i=0; i<n; i++)
11968        if (c[i])
11969         {
11970           p1[i] += 1;
11971           p2[i] = p3[i] +2;
11972         }
11973    this transformation will produce the following semi-hammock:
11974
11975    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11976      {
11977        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11978        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11979        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11980        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11981        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11982        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11983      }
11984 */
11985
11986 void
11987 optimize_mask_stores (class loop *loop)
11988 {
11989   basic_block *bbs = get_loop_body (loop);
11990   unsigned nbbs = loop->num_nodes;
11991   unsigned i;
11992   basic_block bb;
11993   class loop *bb_loop;
11994   gimple_stmt_iterator gsi;
11995   gimple *stmt;
11996   auto_vec<gimple *> worklist;
11997   auto_purge_vect_location sentinel;
11998
11999   vect_location = find_loop_location (loop);
12000   /* Pick up all masked stores in loop if any.  */
12001   for (i = 0; i < nbbs; i++)
12002     {
12003       bb = bbs[i];
12004       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12005            gsi_next (&gsi))
12006         {
12007           stmt = gsi_stmt (gsi);
12008           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12009             worklist.safe_push (stmt);
12010         }
12011     }
12012
12013   free (bbs);
12014   if (worklist.is_empty ())
12015     return;
12016
12017   /* Loop has masked stores.  */
12018   while (!worklist.is_empty ())
12019     {
12020       gimple *last, *last_store;
12021       edge e, efalse;
12022       tree mask;
12023       basic_block store_bb, join_bb;
12024       gimple_stmt_iterator gsi_to;
12025       tree vdef, new_vdef;
12026       gphi *phi;
12027       tree vectype;
12028       tree zero;
12029
12030       last = worklist.pop ();
12031       mask = gimple_call_arg (last, 2);
12032       bb = gimple_bb (last);
12033       /* Create then_bb and if-then structure in CFG, then_bb belongs to
12034          the same loop as if_bb.  It could be different to LOOP when two
12035          level loop-nest is vectorized and mask_store belongs to the inner
12036          one.  */
12037       e = split_block (bb, last);
12038       bb_loop = bb->loop_father;
12039       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12040       join_bb = e->dest;
12041       store_bb = create_empty_bb (bb);
12042       add_bb_to_loop (store_bb, bb_loop);
12043       e->flags = EDGE_TRUE_VALUE;
12044       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12045       /* Put STORE_BB to likely part.  */
12046       efalse->probability = profile_probability::likely ();
12047       e->probability = efalse->probability.invert ();
12048       store_bb->count = efalse->count ();
12049       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12050       if (dom_info_available_p (CDI_DOMINATORS))
12051         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12052       if (dump_enabled_p ())
12053         dump_printf_loc (MSG_NOTE, vect_location,
12054                          "Create new block %d to sink mask stores.",
12055                          store_bb->index);
12056       /* Create vector comparison with boolean result.  */
12057       vectype = TREE_TYPE (mask);
12058       zero = build_zero_cst (vectype);
12059       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12060       gsi = gsi_last_bb (bb);
12061       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12062       /* Create new PHI node for vdef of the last masked store:
12063          .MEM_2 = VDEF <.MEM_1>
12064          will be converted to
12065          .MEM.3 = VDEF <.MEM_1>
12066          and new PHI node will be created in join bb
12067          .MEM_2 = PHI <.MEM_1, .MEM_3>
12068       */
12069       vdef = gimple_vdef (last);
12070       new_vdef = make_ssa_name (gimple_vop (cfun), last);
12071       gimple_set_vdef (last, new_vdef);
12072       phi = create_phi_node (vdef, join_bb);
12073       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12074
12075       /* Put all masked stores with the same mask to STORE_BB if possible.  */
12076       while (true)
12077         {
12078           gimple_stmt_iterator gsi_from;
12079           gimple *stmt1 = NULL;
12080
12081           /* Move masked store to STORE_BB.  */
12082           last_store = last;
12083           gsi = gsi_for_stmt (last);
12084           gsi_from = gsi;
12085           /* Shift GSI to the previous stmt for further traversal.  */
12086           gsi_prev (&gsi);
12087           gsi_to = gsi_start_bb (store_bb);
12088           gsi_move_before (&gsi_from, &gsi_to);
12089           /* Setup GSI_TO to the non-empty block start.  */
12090           gsi_to = gsi_start_bb (store_bb);
12091           if (dump_enabled_p ())
12092             dump_printf_loc (MSG_NOTE, vect_location,
12093                              "Move stmt to created bb\n%G", last);
12094           /* Move all stored value producers if possible.  */
12095           while (!gsi_end_p (gsi))
12096             {
12097               tree lhs;
12098               imm_use_iterator imm_iter;
12099               use_operand_p use_p;
12100               bool res;
12101
12102               /* Skip debug statements.  */
12103               if (is_gimple_debug (gsi_stmt (gsi)))
12104                 {
12105                   gsi_prev (&gsi);
12106                   continue;
12107                 }
12108               stmt1 = gsi_stmt (gsi);
12109               /* Do not consider statements writing to memory or having
12110                  volatile operand.  */
12111               if (gimple_vdef (stmt1)
12112                   || gimple_has_volatile_ops (stmt1))
12113                 break;
12114               gsi_from = gsi;
12115               gsi_prev (&gsi);
12116               lhs = gimple_get_lhs (stmt1);
12117               if (!lhs)
12118                 break;
12119
12120               /* LHS of vectorized stmt must be SSA_NAME.  */
12121               if (TREE_CODE (lhs) != SSA_NAME)
12122                 break;
12123
12124               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12125                 {
12126                   /* Remove dead scalar statement.  */
12127                   if (has_zero_uses (lhs))
12128                     {
12129                       gsi_remove (&gsi_from, true);
12130                       continue;
12131                     }
12132                 }
12133
12134               /* Check that LHS does not have uses outside of STORE_BB.  */
12135               res = true;
12136               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12137                 {
12138                   gimple *use_stmt;
12139                   use_stmt = USE_STMT (use_p);
12140                   if (is_gimple_debug (use_stmt))
12141                     continue;
12142                   if (gimple_bb (use_stmt) != store_bb)
12143                     {
12144                       res = false;
12145                       break;
12146                     }
12147                 }
12148               if (!res)
12149                 break;
12150
12151               if (gimple_vuse (stmt1)
12152                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
12153                 break;
12154
12155               /* Can move STMT1 to STORE_BB.  */
12156               if (dump_enabled_p ())
12157                 dump_printf_loc (MSG_NOTE, vect_location,
12158                                  "Move stmt to created bb\n%G", stmt1);
12159               gsi_move_before (&gsi_from, &gsi_to);
12160               /* Shift GSI_TO for further insertion.  */
12161               gsi_prev (&gsi_to);
12162             }
12163           /* Put other masked stores with the same mask to STORE_BB.  */
12164           if (worklist.is_empty ()
12165               || gimple_call_arg (worklist.last (), 2) != mask
12166               || worklist.last () != stmt1)
12167             break;
12168           last = worklist.pop ();
12169         }
12170       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12171     }
12172 }
12173
12174 /* Decide whether it is possible to use a zero-based induction variable
12175    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
12176    the value that the induction variable must be able to hold in order
12177    to ensure that the rgroups eventually have no active vector elements.
12178    Return -1 otherwise.  */
12179
12180 widest_int
12181 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12182 {
12183   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12184   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12185   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12186
12187   /* Calculate the value that the induction variable must be able
12188      to hit in order to ensure that we end the loop with an all-false mask.
12189      This involves adding the maximum number of inactive trailing scalar
12190      iterations.  */
12191   widest_int iv_limit = -1;
12192   if (max_loop_iterations (loop, &iv_limit))
12193     {
12194       if (niters_skip)
12195         {
12196           /* Add the maximum number of skipped iterations to the
12197              maximum iteration count.  */
12198           if (TREE_CODE (niters_skip) == INTEGER_CST)
12199             iv_limit += wi::to_widest (niters_skip);
12200           else
12201             iv_limit += max_vf - 1;
12202         }
12203       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12204         /* Make a conservatively-correct assumption.  */
12205         iv_limit += max_vf - 1;
12206
12207       /* IV_LIMIT is the maximum number of latch iterations, which is also
12208          the maximum in-range IV value.  Round this value down to the previous
12209          vector alignment boundary and then add an extra full iteration.  */
12210       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12211       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12212     }
12213   return iv_limit;
12214 }
12215
12216 /* For the given rgroup_controls RGC, check whether an induction variable
12217    would ever hit a value that produces a set of all-false masks or zero
12218    lengths before wrapping around.  Return true if it's possible to wrap
12219    around before hitting the desirable value, otherwise return false.  */
12220
12221 bool
12222 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12223 {
12224   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12225
12226   if (iv_limit == -1)
12227     return true;
12228
12229   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12230   unsigned int compare_precision = TYPE_PRECISION (compare_type);
12231   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12232
12233   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12234     return true;
12235
12236   return false;
12237 }