gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFs whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
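   For instance (illustrative only): accesses such as a[i] or *(p + i)
   that advance by one element per iteration have the simple consecutive
   pattern meant here, whereas strided accesses such as a[2*i] or
   indirect accesses such as a[b[i]] do not.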
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
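     As an illustrative sketch only (mirroring the check described above;
     the exact code in the vectorizable_* routines differs):

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         ... no target support, so the addition cannot be vectorized ...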
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements
264 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
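/* Illustrative example (not from the original sources): for an IV such as

     p_1 = PHI <p_0 (preheader), p_2 (latch)>
     ...
     p_2 = p_1 + 4;

   the scalar evolution of p_1 is the chrec {p_0, +, 4}, so the routine
   below would return init = p_0 and step = 4.  */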
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
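/* Illustrative examples of such inductions (not from the original sources):

     x = x0;                        y = y0;
     for (i = 0; i < n; i++)        for (i = 0; i < n; i++)
       { a[i] = x; x = x * 3; }       { b[i] = y; y = -y; }

   The left loop is classified as vect_step_op_mul with step 3; the right
   loop as vect_step_op_neg with the fake step -1.  */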
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
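/* Illustrative example of a first-order recurrence (not from the original
   sources):

     t = 0;
     for (i = 0; i < n; i++)
       { b[i] = a[i] - t; t = a[i]; }

   Each iteration reads the value of t produced by the previous iteration,
   but t is not a reduction.  */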
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be some
585 subsequent SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such an inner loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
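  /* Illustrative outer-loop example (not from the original sources):

       for (i = 0; i < n; i++)       <-- loop being vectorized
         {
           s = 0;
           for (j = 0; j < m; j++)   <-- executed sequentially
             s += a[i][j];
           b[i] = s;
         }

     When the outer loop is vectorized, the inner-loop reduction into s
     must keep its original (sequential) evaluation order.  */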
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmt in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit condition. */
857 static gcond *
858 vect_get_loop_niters (class loop *loop, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 edge exit = single_exit (loop);
862 class tree_niter_desc niter_desc;
863 tree niter_assumptions, niter, may_be_zero;
864 gcond *cond = get_loop_exit_condition (loop);
866 *assumptions = boolean_true_node;
867 *number_of_iterationsm1 = chrec_dont_know;
868 *number_of_iterations = chrec_dont_know;
869 DUMP_VECT_SCOPE ("get_loop_niters");
871 if (!exit)
872 return cond;
874 may_be_zero = NULL_TREE;
875 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
876 || chrec_contains_undetermined (niter_desc.niter))
877 return cond;
879 niter_assumptions = niter_desc.assumptions;
880 may_be_zero = niter_desc.may_be_zero;
881 niter = niter_desc.niter;
883 if (may_be_zero && integer_zerop (may_be_zero))
884 may_be_zero = NULL_TREE;
886 if (may_be_zero)
888 if (COMPARISON_CLASS_P (may_be_zero))
890 /* Try to combine may_be_zero with assumptions, this can simplify
891 computation of niter expression. */
892 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
893 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
894 niter_assumptions,
895 fold_build1 (TRUTH_NOT_EXPR,
896 boolean_type_node,
897 may_be_zero));
898 else
899 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
900 build_int_cst (TREE_TYPE (niter), 0),
901 rewrite_to_non_trapping_overflow (niter));
903 may_be_zero = NULL_TREE;
905 else if (integer_nonzerop (may_be_zero))
907 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
908 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
909 return cond;
911 else
912 return cond;
915 *assumptions = niter_assumptions;
916 *number_of_iterationsm1 = niter;
918 /* We want the number of loop header executions which is the number
919 of latch executions plus one.
920 ??? For UINT_MAX latch executions this number overflows to zero
921 for loops like do { n++; } while (n != 0); */
922 if (niter && !chrec_contains_undetermined (niter))
923 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
924 build_int_cst (TREE_TYPE (niter), 1));
925 *number_of_iterations = niter;
927 return cond;
930 /* Function bb_in_loop_p
932 Used as predicate for dfs order traversal of the loop bbs. */
934 static bool
935 bb_in_loop_p (const_basic_block bb, const void *data)
937 const class loop *const loop = (const class loop *)data;
938 if (flow_bb_inside_loop_p (loop, bb))
939 return true;
940 return false;
944 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
945 stmt_vec_info structs for all the stmts in LOOP_IN. */
947 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
948 : vec_info (vec_info::loop, shared),
949 loop (loop_in),
950 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
951 num_itersm1 (NULL_TREE),
952 num_iters (NULL_TREE),
953 num_iters_unchanged (NULL_TREE),
954 num_iters_assumptions (NULL_TREE),
955 vector_costs (nullptr),
956 scalar_costs (nullptr),
957 th (0),
958 versioning_threshold (0),
959 vectorization_factor (0),
960 main_loop_edge (nullptr),
961 skip_main_loop_edge (nullptr),
962 skip_this_loop_edge (nullptr),
963 reusable_accumulators (),
964 suggested_unroll_factor (1),
965 max_vectorization_factor (0),
966 mask_skip_niters (NULL_TREE),
967 rgroup_compare_type (NULL_TREE),
968 simd_if_cond (NULL_TREE),
969 partial_vector_style (vect_partial_vectors_none),
970 unaligned_dr (NULL),
971 peeling_for_alignment (0),
972 ptr_mask (0),
973 ivexpr_map (NULL),
974 scan_map (NULL),
975 slp_unrolling_factor (1),
976 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
977 vectorizable (false),
978 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
979 using_partial_vectors_p (false),
980 using_decrementing_iv_p (false),
981 using_select_vl_p (false),
982 epil_using_partial_vectors_p (false),
983 partial_load_store_bias (0),
984 peeling_for_gaps (false),
985 peeling_for_niter (false),
986 no_data_dependencies (false),
987 has_mask_store (false),
988 scalar_loop_scaling (profile_probability::uninitialized ()),
989 scalar_loop (NULL),
990 orig_loop_info (NULL)
992 /* CHECKME: We want to visit all BBs before their successors (except for
993 latch blocks, for which this assertion wouldn't hold). In the simple
994 case of the loop forms we allow, a dfs order of the BBs would be the
995 same as reversed postorder traversal, so we are safe. */
997 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
998 bbs, loop->num_nodes, loop);
999 gcc_assert (nbbs == loop->num_nodes);
1001 for (unsigned int i = 0; i < nbbs; i++)
1003 basic_block bb = bbs[i];
1004 gimple_stmt_iterator si;
1006 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1008 gimple *phi = gsi_stmt (si);
1009 gimple_set_uid (phi, 0);
1010 add_stmt (phi);
1013 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1015 gimple *stmt = gsi_stmt (si);
1016 gimple_set_uid (stmt, 0);
1017 if (is_gimple_debug (stmt))
1018 continue;
1019 add_stmt (stmt);
1020 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1021 third argument is the #pragma omp simd if (x) condition: when it is 0,
1022 the loop shouldn't be vectorized; when it is a non-zero constant, it
1023 should be vectorized normally; otherwise the loop is versioned, with the
1024 vectorized copy taken if the condition is non-zero at runtime. */
1025 if (loop_in->simduid
1026 && is_gimple_call (stmt)
1027 && gimple_call_internal_p (stmt)
1028 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1029 && gimple_call_num_args (stmt) >= 3
1030 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1031 && (loop_in->simduid
1032 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1034 tree arg = gimple_call_arg (stmt, 2);
1035 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1036 simd_if_cond = arg;
1037 else
1038 gcc_assert (integer_nonzerop (arg));
1043 epilogue_vinfos.create (6);
1046 /* Free all levels of rgroup CONTROLS. */
1048 void
1049 release_vec_loop_controls (vec<rgroup_controls> *controls)
1051 rgroup_controls *rgc;
1052 unsigned int i;
1053 FOR_EACH_VEC_ELT (*controls, i, rgc)
1054 rgc->controls.release ();
1055 controls->release ();
1058 /* Free all memory used by the _loop_vec_info, as well as all the
1059 stmt_vec_info structs of all the stmts in the loop. */
1061 _loop_vec_info::~_loop_vec_info ()
1063 free (bbs);
1065 release_vec_loop_controls (&masks.rgc_vec);
1066 release_vec_loop_controls (&lens);
1067 delete ivexpr_map;
1068 delete scan_map;
1069 epilogue_vinfos.release ();
1070 delete scalar_costs;
1071 delete vector_costs;
1073 /* When we release an epilogue vinfo that we do not intend to use,
1074 avoid clearing AUX of the main loop, which should continue to
1075 point to the main loop vinfo since otherwise we'd leak it. */
1076 if (loop->aux == this)
1077 loop->aux = NULL;
1080 /* Return an invariant or register for EXPR and emit necessary
1081 computations in the LOOP_VINFO loop preheader. */
1083 tree
1084 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1086 if (is_gimple_reg (expr)
1087 || is_gimple_min_invariant (expr))
1088 return expr;
1090 if (! loop_vinfo->ivexpr_map)
1091 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1092 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1093 if (! cached)
1095 gimple_seq stmts = NULL;
1096 cached = force_gimple_operand (unshare_expr (expr),
1097 &stmts, true, NULL_TREE);
1098 if (stmts)
1100 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1101 gsi_insert_seq_on_edge_immediate (e, stmts);
1104 return cached;
1107 /* Return true if we can use CMP_TYPE as the comparison type to produce
1108 all masks required to mask LOOP_VINFO. */
1110 static bool
1111 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1113 rgroup_controls *rgm;
1114 unsigned int i;
1115 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1116 if (rgm->type != NULL_TREE
1117 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1118 cmp_type, rgm->type,
1119 OPTIMIZE_FOR_SPEED))
1120 return false;
1121 return true;
1124 /* Calculate the maximum number of scalars per iteration for every
1125 rgroup in LOOP_VINFO. */
1127 static unsigned int
1128 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1130 unsigned int res = 1;
1131 unsigned int i;
1132 rgroup_controls *rgm;
1133 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1134 res = MAX (res, rgm->max_nscalars_per_iter);
1135 return res;
1138 /* Calculate the minimum precision necessary to represent:
1140 MAX_NITERS * FACTOR
1142 as an unsigned integer, where MAX_NITERS is the maximum number of
1143 loop header iterations for the original scalar form of LOOP_VINFO. */
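/* Illustrative example (not from the original sources): with
   MAX_NITERS = 1000 and FACTOR = 4 the product is 4000, which needs
   wi::min_precision (4000, UNSIGNED) = 12 bits.  */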
1145 static unsigned
1146 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1148 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1150 /* Get the maximum number of iterations that is representable
1151 in the counter type. */
1152 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1153 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1155 /* Get a more refined estimate for the number of iterations. */
1156 widest_int max_back_edges;
1157 if (max_loop_iterations (loop, &max_back_edges))
1158 max_ni = wi::smin (max_ni, max_back_edges + 1);
1160 /* Work out how many bits we need to represent the limit. */
1161 return wi::min_precision (max_ni * factor, UNSIGNED);
1164 /* True if the loop needs peeling or partial vectors when vectorized. */
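/* Illustrative example (not from the original sources): with a known
   niter of 100 and a vectorization factor of 8, 100 is not a multiple
   of 8, so the remaining 4 iterations need either an epilogue loop or
   partial vectors.  */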
1166 static bool
1167 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1169 unsigned HOST_WIDE_INT const_vf;
1170 HOST_WIDE_INT max_niter
1171 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1173 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1174 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1175 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1176 (loop_vinfo));
1178 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1179 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1181 /* Work out the (constant) number of iterations that need to be
1182 peeled for reasons other than niters. */
1183 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1184 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1185 peel_niter += 1;
1186 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1187 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1188 return true;
1190 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1191 /* ??? When peeling for gaps but not alignment, we could
1192 try to check whether the (variable) niters is known to be
1193 VF * N + 1. That's something of a niche case though. */
1194 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1195 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1196 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1197 < (unsigned) exact_log2 (const_vf))
1198 /* In case of versioning, check if the maximum number of
1199 iterations is greater than th. If they are identical,
1200 the epilogue is unnecessary. */
1201 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1202 || ((unsigned HOST_WIDE_INT) max_niter
1203 > (th / const_vf) * const_vf))))
1204 return true;
1206 return false;
1209 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1210 whether we can actually generate the masks required. Return true if so,
1211 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
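/* Illustrative example (not from the original sources): with a
   vectorization factor of 4, the mask for the vector iteration whose
   first scalar index is i is conceptually

     { i+0 < niters, i+1 < niters, i+2 < niters, i+3 < niters }

   which is what the IFN_WHILE_ULT internal function computes.  */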
1213 static bool
1214 vect_verify_full_masking (loop_vec_info loop_vinfo)
1216 unsigned int min_ni_width;
1218 /* Use a normal loop if there are no statements that need masking.
1219 This only happens in rare degenerate cases: it means that the loop
1220 has no loads, no stores, and no live-out values. */
1221 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1222 return false;
1224 /* Produce the rgroup controls. */
1225 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1227 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1228 tree vectype = mask.first;
1229 unsigned nvectors = mask.second;
1231 if (masks->rgc_vec.length () < nvectors)
1232 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1233 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1234 /* The number of scalars per iteration and the number of vectors are
1235 both compile-time constants. */
1236 unsigned int nscalars_per_iter
1237 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1238 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
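      /* For example (illustrative only): nvectors = 2 masks of a
         16-element vector type with VF = 16 give nscalars_per_iter
         = 2 * 16 / 16 = 2.  */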
1240 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1242 rgm->max_nscalars_per_iter = nscalars_per_iter;
1243 rgm->type = truth_type_for (vectype);
1244 rgm->factor = 1;
1248 unsigned int max_nscalars_per_iter
1249 = vect_get_max_nscalars_per_iter (loop_vinfo);
1251 /* Work out how many bits we need to represent the limit. */
1252 min_ni_width
1253 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1255 /* Find a scalar mode for which WHILE_ULT is supported. */
1256 opt_scalar_int_mode cmp_mode_iter;
1257 tree cmp_type = NULL_TREE;
1258 tree iv_type = NULL_TREE;
1259 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1260 unsigned int iv_precision = UINT_MAX;
1262 if (iv_limit != -1)
1263 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1264 UNSIGNED);
1266 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1268 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1269 if (cmp_bits >= min_ni_width
1270 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1272 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1273 if (this_type
1274 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1276 /* Although we could stop as soon as we find a valid mode,
1277 there are at least two reasons why that's not always the
1278 best choice:
1280 - An IV that's Pmode or wider is more likely to be reusable
1281 in address calculations than an IV that's narrower than
1282 Pmode.
1284 - Doing the comparison in IV_PRECISION or wider allows
1285 a natural 0-based IV, whereas using a narrower comparison
1286 type requires mitigations against wrap-around.
1288 Conversely, if the IV limit is variable, doing the comparison
1289 in a wider type than the original type can introduce
1290 unnecessary extensions, so picking the widest valid mode
1291 is not always a good choice either.
1293 Here we prefer the first IV type that's Pmode or wider,
1294 and the first comparison type that's IV_PRECISION or wider.
1295 (The comparison type must be no wider than the IV type,
1296 to avoid extensions in the vector loop.)
1298 ??? We might want to try continuing beyond Pmode for ILP32
1299 targets if CMP_BITS < IV_PRECISION. */
1300 iv_type = this_type;
1301 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1302 cmp_type = this_type;
1303 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1304 break;
1309 if (!cmp_type)
1311 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1312 return false;
1315 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1316 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1317 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1318 return true;
1321 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1322 whether we can actually generate AVX512 style masks. Return true if so,
1323 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1325 static bool
1326 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1328 /* Produce a differently organized rgc_vec and check differently
1329 whether we can produce the masks. */
1331 /* Use a normal loop if there are no statements that need masking.
1332 This only happens in rare degenerate cases: it means that the loop
1333 has no loads, no stores, and no live-out values. */
1334 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1335 return false;
1337 /* For the decrementing IV we need to represent all values in
1338 [0, niter + niter_skip] where niter_skip is the elements we
1339 skip in the first iteration for prologue peeling. */
1340 tree iv_type = NULL_TREE;
1341 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1342 unsigned int iv_precision = UINT_MAX;
1343 if (iv_limit != -1)
1344 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1346 /* First compute the type for the IV we use to track the remaining
1347 scalar iterations. */
1348 opt_scalar_int_mode cmp_mode_iter;
1349 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1351 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1352 if (cmp_bits >= iv_precision
1353 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1355 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1356 if (iv_type)
1357 break;
1360 if (!iv_type)
1361 return false;
1363 /* Produce the rgroup controls. */
1364 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1366 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1367 tree vectype = mask.first;
1368 unsigned nvectors = mask.second;
1370 /* The number of scalars per iteration and the number of vectors are
1371 both compile-time constants. */
1372 unsigned int nscalars_per_iter
1373 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1374 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1376 /* We index the rgroup_controls vector by nscalars_per_iter,
1377 which we keep constant, and instead have a varying nvectors,
1378 remembering the vector mask with the fewest nvectors. */
1379 if (masks->rgc_vec.length () < nscalars_per_iter)
1380 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1381 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1383 if (!rgm->type || rgm->factor > nvectors)
1385 rgm->type = truth_type_for (vectype);
1386 rgm->compare_type = NULL_TREE;
1387 rgm->max_nscalars_per_iter = nscalars_per_iter;
1388 rgm->factor = nvectors;
1389 rgm->bias_adjusted_ctrl = NULL_TREE;
1393 /* There is no fixed compare type we are going to use but we have to
1394 be able to get at one for each mask group. */
1395 unsigned int min_ni_width
1396 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1398 bool ok = true;
1399 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1401 tree mask_type = rgc.type;
1402 if (!mask_type)
1403 continue;
1405 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1407 ok = false;
1408 break;
1411 /* If iv_type is usable as compare type use that - we can elide the
1412 saturation in that case. */
1413 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1415 tree cmp_vectype
1416 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1417 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1418 rgc.compare_type = cmp_vectype;
1420 if (!rgc.compare_type)
1421 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1423 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1424 if (cmp_bits >= min_ni_width
1425 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1427 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1428 if (!cmp_type)
1429 continue;
1431 /* Check whether we can produce the mask with cmp_type. */
1432 tree cmp_vectype
1433 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1434 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1436 rgc.compare_type = cmp_vectype;
1437 break;
1441 if (!rgc.compare_type)
1443 ok = false;
1444 break;
1447 if (!ok)
1449 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1450 return false;
1453 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1454 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1455 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1456 return true;
1459 /* Check whether we can use vector accesses with length based on precision
1460 comparison. So far, to keep it simple, we only allow the case that the
1461 precision of the target-supported length is larger than the precision
1462 required by the loop niters. */
1464 static bool
1465 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1467 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1468 return false;
1470 machine_mode len_load_mode, len_store_mode;
1471 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1472 .exists (&len_load_mode))
1473 return false;
1474 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1475 .exists (&len_store_mode))
1476 return false;
1478 signed char partial_load_bias = internal_len_load_store_bias
1479 (IFN_LEN_LOAD, len_load_mode);
1481 signed char partial_store_bias = internal_len_load_store_bias
1482 (IFN_LEN_STORE, len_store_mode);
1484 gcc_assert (partial_load_bias == partial_store_bias);
1486 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1487 return false;
1489 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1490 len_loads with a length of zero. In order to avoid that we prohibit
1491 more than one loop length here. */
1492 if (partial_load_bias == -1
1493 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1494 return false;
1496 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1498 unsigned int max_nitems_per_iter = 1;
1499 unsigned int i;
1500 rgroup_controls *rgl;
1501 /* Find the maximum number of items per iteration for every rgroup. */
1502 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1504 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1505 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1508 /* Work out how many bits we need to represent the length limit. */
1509 unsigned int min_ni_prec
1510 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1512 /* Now use the maximum of the precisions below for one suitable IV type:
1513 - the IV's natural precision
1514 - the precision needed to hold: the maximum number of scalar
1515 iterations multiplied by the scale factor (min_ni_prec above)
1516 - the Pmode precision
1518 If min_ni_prec is less than the precision of the current niters,
1519 we prefer to still use the niters type. Prefer to use Pmode and
1520 a wider IV to avoid narrow conversions. */
1522 unsigned int ni_prec
1523 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1524 min_ni_prec = MAX (min_ni_prec, ni_prec);
1525 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1527 tree iv_type = NULL_TREE;
1528 opt_scalar_int_mode tmode_iter;
1529 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1531 scalar_mode tmode = tmode_iter.require ();
1532 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1534 /* ??? Do we really want to construct one IV whose precision exceeds
1535 BITS_PER_WORD? */
1536 if (tbits > BITS_PER_WORD)
1537 break;
1539 /* Find the first available standard integral type. */
1540 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1542 iv_type = build_nonstandard_integer_type (tbits, true);
1543 break;
1547 if (!iv_type)
1549 if (dump_enabled_p ())
1550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1551 "can't vectorize with length-based partial vectors"
1552 " because there is no suitable iv type.\n");
1553 return false;
1556 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1557 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1558 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1560 return true;
1563 /* Calculate the cost of one scalar iteration of the loop. */
1564 static void
1565 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1567 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1568 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1569 int nbbs = loop->num_nodes, factor;
1570 int innerloop_iters, i;
1572 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1574 /* Gather costs for statements in the scalar loop. */
1576 /* FORNOW. */
1577 innerloop_iters = 1;
1578 if (loop->inner)
1579 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1581 for (i = 0; i < nbbs; i++)
1583 gimple_stmt_iterator si;
1584 basic_block bb = bbs[i];
1586 if (bb->loop_father == loop->inner)
1587 factor = innerloop_iters;
1588 else
1589 factor = 1;
1591 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1593 gimple *stmt = gsi_stmt (si);
1594 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1596 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1597 continue;
1599 /* Skip stmts that are not vectorized inside the loop. */
1600 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1601 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1602 && (!STMT_VINFO_LIVE_P (vstmt_info)
1603 || !VECTORIZABLE_CYCLE_DEF
1604 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1605 continue;
1607 vect_cost_for_stmt kind;
1608 if (STMT_VINFO_DATA_REF (stmt_info))
1610 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1611 kind = scalar_load;
1612 else
1613 kind = scalar_store;
1615 else if (vect_nop_conversion_p (stmt_info))
1616 continue;
1617 else
1618 kind = scalar_stmt;
1620 /* We are using vect_prologue here to avoid scaling twice
1621 by the inner loop factor. */
1622 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1623 factor, kind, stmt_info, 0, vect_prologue);
1627 /* Now accumulate cost. */
1628 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1629 add_stmt_costs (loop_vinfo->scalar_costs,
1630 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1631 loop_vinfo->scalar_costs->finish_cost (nullptr);
1635 /* Function vect_analyze_loop_form.
1637 Verify that certain CFG restrictions hold, including:
1638 - the loop has a pre-header
1639 - the loop has a single entry and exit
1640 - the loop exit condition is simple enough
1641 - the number of iterations can be analyzed, i.e., it is a countable loop.
1642 The niter may only be analyzable under some assumptions. */
1644 opt_result
1645 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1647 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1649 /* Different restrictions apply when we are considering an inner-most loop,
1650 vs. an outer (nested) loop.
1651 (FORNOW. May want to relax some of these restrictions in the future). */
1653 info->inner_loop_cond = NULL;
1654 if (!loop->inner)
1656 /* Inner-most loop. We currently require that the number of BBs is
1657 exactly 2 (the header and latch). Vectorizable inner-most loops
1658 look like this:
1660 (pre-header)
1662 header <--------+
1663 | | |
1664 | +--> latch --+
1666 (exit-bb) */
1668 if (loop->num_nodes != 2)
1669 return opt_result::failure_at (vect_location,
1670 "not vectorized:"
1671 " control flow in loop.\n");
1673 if (empty_block_p (loop->header))
1674 return opt_result::failure_at (vect_location,
1675 "not vectorized: empty loop.\n");
1677 else
1679 class loop *innerloop = loop->inner;
1680 edge entryedge;
1682 /* Nested loop. We currently require that the loop is doubly-nested,
1683 contains a single inner loop, and the number of BBs is exactly 5.
1684 Vectorizable outer-loops look like this:
1686 (pre-header)
1688 header <---+
1690 inner-loop |
1692 tail ------+
1694 (exit-bb)
1696 The inner-loop has the properties expected of inner-most loops
1697 as described above. */
1699 if ((loop->inner)->inner || (loop->inner)->next)
1700 return opt_result::failure_at (vect_location,
1701 "not vectorized:"
1702 " multiple nested loops.\n");
1704 if (loop->num_nodes != 5)
1705 return opt_result::failure_at (vect_location,
1706 "not vectorized:"
1707 " control flow in loop.\n");
1709 entryedge = loop_preheader_edge (innerloop);
1710 if (entryedge->src != loop->header
1711 || !single_exit (innerloop)
1712 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1713 return opt_result::failure_at (vect_location,
1714 "not vectorized:"
1715 " unsupported outerloop form.\n");
1717 /* Analyze the inner-loop. */
1718 vect_loop_form_info inner;
1719 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1720 if (!res)
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 "not vectorized: Bad inner loop.\n");
1725 return res;
1728 /* Don't support analyzing niter under assumptions for inner
1729 loop. */
1730 if (!integer_onep (inner.assumptions))
1731 return opt_result::failure_at (vect_location,
1732 "not vectorized: Bad inner loop.\n");
1734 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1735 return opt_result::failure_at (vect_location,
1736 "not vectorized: inner-loop count not"
1737 " invariant.\n");
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "Considering outer-loop vectorization.\n");
1742 info->inner_loop_cond = inner.loop_cond;
1745 if (!single_exit (loop))
1746 return opt_result::failure_at (vect_location,
1747 "not vectorized: multiple exits.\n");
1748 if (EDGE_COUNT (loop->header->preds) != 2)
1749 return opt_result::failure_at (vect_location,
1750 "not vectorized:"
1751 " too many incoming edges.\n");
1753 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1754 that the loop is represented as a do-while (with a proper if-guard
1755 before the loop if needed), where the loop header contains all the
1756 executable statements, and the latch is empty. */
1757 if (!empty_block_p (loop->latch)
1758 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1759 return opt_result::failure_at (vect_location,
1760 "not vectorized: latch block not empty.\n");
1762 /* Make sure the exit is not abnormal. */
1763 edge e = single_exit (loop);
1764 if (e->flags & EDGE_ABNORMAL)
1765 return opt_result::failure_at (vect_location,
1766 "not vectorized:"
1767 " abnormal loop exit edge.\n");
1769 info->loop_cond
1770 = vect_get_loop_niters (loop, &info->assumptions,
1771 &info->number_of_iterations,
1772 &info->number_of_iterationsm1);
1773 if (!info->loop_cond)
1774 return opt_result::failure_at
1775 (vect_location,
1776 "not vectorized: complicated exit condition.\n");
1778 if (integer_zerop (info->assumptions)
1779 || !info->number_of_iterations
1780 || chrec_contains_undetermined (info->number_of_iterations))
1781 return opt_result::failure_at
1782 (info->loop_cond,
1783 "not vectorized: number of iterations cannot be computed.\n");
1785 if (integer_zerop (info->number_of_iterations))
1786 return opt_result::failure_at
1787 (info->loop_cond,
1788 "not vectorized: number of iterations = 0.\n");
1790 if (!(tree_fits_shwi_p (info->number_of_iterations)
1791 && tree_to_shwi (info->number_of_iterations) > 0))
1793 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_NOTE, vect_location,
1796 "Symbolic number of iterations is ");
1797 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1798 dump_printf (MSG_NOTE, "\n");
1802 return opt_result::success ();
1805 /* Create a loop_vec_info for LOOP with SHARED and the
1806 vect_analyze_loop_form result. */
1808 loop_vec_info
1809 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1810 const vect_loop_form_info *info,
1811 loop_vec_info main_loop_info)
1813 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1814 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1815 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1816 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1817 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1818 /* Also record the assumptions for versioning. */
1819 if (!integer_onep (info->assumptions) && !main_loop_info)
1820 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1822 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1823 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824 if (info->inner_loop_cond)
1826 stmt_vec_info inner_loop_cond_info
1827 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1828 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1829 /* If we have an estimate on the number of iterations of the inner
1830 loop, use that to limit the scale for costing; otherwise use
1831 --param vect-inner-loop-cost-factor literally. */
1832 widest_int nit;
1833 if (estimated_stmt_executions (loop->inner, &nit))
1834 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1835 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1838 return loop_vinfo;
1843 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1844 statements, update the vectorization factor. */
1846 static void
1847 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1849 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1850 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1851 int nbbs = loop->num_nodes;
1852 poly_uint64 vectorization_factor;
1853 int i;
1855 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1857 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1858 gcc_assert (known_ne (vectorization_factor, 0U));
1860 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1861 vectorization factor of the loop is the unrolling factor required by
1862 the SLP instances. If that unrolling factor is 1, we say that we
1863 perform pure SLP on the loop - cross-iteration parallelism is not
1864 exploited. */
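/* For illustration only (assumed numbers): with V4SI vectors, an SLP
   instance grouping four scalar stmts per iteration needs no unrolling
   (factor 1, pure SLP), whereas a group of two stmts needs an unrolling
   factor of 2 so that two iterations fill each vector.  */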
1865 bool only_slp_in_loop = true;
1866 for (i = 0; i < nbbs; i++)
1868 basic_block bb = bbs[i];
1869 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1870 gsi_next (&si))
1872 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1873 if (!stmt_info)
1874 continue;
1875 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1876 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1877 && !PURE_SLP_STMT (stmt_info))
1878 /* STMT needs both SLP and loop-based vectorization. */
1879 only_slp_in_loop = false;
1881 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1882 gsi_next (&si))
1884 if (is_gimple_debug (gsi_stmt (si)))
1885 continue;
1886 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1887 stmt_info = vect_stmt_to_vectorize (stmt_info);
1888 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1889 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1890 && !PURE_SLP_STMT (stmt_info))
1891 /* STMT needs both SLP and loop-based vectorization. */
1892 only_slp_in_loop = false;
1896 if (only_slp_in_loop)
1898 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_NOTE, vect_location,
1900 "Loop contains only SLP stmts\n");
1901 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1903 else
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location,
1907 "Loop contains SLP and non-SLP stmts\n");
1908 /* Both the vectorization factor and unroll factor have the form
1909 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1910 so they must have a common multiple. */
1911 vectorization_factor
1912 = force_common_multiple (vectorization_factor,
1913 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1916 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1917 if (dump_enabled_p ())
1919 dump_printf_loc (MSG_NOTE, vect_location,
1920 "Updating vectorization factor to ");
1921 dump_dec (MSG_NOTE, vectorization_factor);
1922 dump_printf (MSG_NOTE, ".\n");
1926 /* Return true if STMT_INFO describes a double reduction phi and if
1927 the other phi in the reduction is also relevant for vectorization.
1928 This rejects cases such as:
1930 outer1:
1931 x_1 = PHI <x_3(outer2), ...>;
1934 inner:
1935 x_2 = ...;
1938 outer2:
1939 x_3 = PHI <x_2(inner)>;
1941 if nothing in x_2 or elsewhere makes x_1 relevant. */
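/* For illustration only (an assumed source-level example): a nested summation

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         s += a[i][j];

   creates such a pair of phis for S; the function below only treats the pair
   as an active double reduction if the outer phi is itself relevant.  */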
1943 static bool
1944 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1946 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1947 return false;
1949 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1952 /* Function vect_analyze_loop_operations.
1954 Scan the loop stmts and make sure they are all vectorizable. */
1956 static opt_result
1957 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1959 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1960 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1961 int nbbs = loop->num_nodes;
1962 int i;
1963 stmt_vec_info stmt_info;
1964 bool need_to_vectorize = false;
1965 bool ok;
1967 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1969 auto_vec<stmt_info_for_cost> cost_vec;
1971 for (i = 0; i < nbbs; i++)
1973 basic_block bb = bbs[i];
1975 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1976 gsi_next (&si))
1978 gphi *phi = si.phi ();
1979 ok = true;
1981 stmt_info = loop_vinfo->lookup_stmt (phi);
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1984 (gimple *) phi);
1985 if (virtual_operand_p (gimple_phi_result (phi)))
1986 continue;
1988 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1989 (i.e., a phi in the tail of the outer-loop). */
1990 if (! is_loop_header_bb_p (bb))
1992 /* FORNOW: we currently don't support the case that these phis
1993 are not used in the outerloop (unless it is double reduction,
1994 i.e., this phi is vect_reduction_def), because this case
1995 would require us to actually do something here. */
1996 if (STMT_VINFO_LIVE_P (stmt_info)
1997 && !vect_active_double_reduction_p (stmt_info))
1998 return opt_result::failure_at (phi,
1999 "Unsupported loop-closed phi"
2000 " in outer-loop.\n");
2002 /* If PHI is used in the outer loop, we check that its operand
2003 is defined in the inner loop. */
2004 if (STMT_VINFO_RELEVANT_P (stmt_info))
2006 tree phi_op;
2008 if (gimple_phi_num_args (phi) != 1)
2009 return opt_result::failure_at (phi, "unsupported phi");
2011 phi_op = PHI_ARG_DEF (phi, 0);
2012 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2013 if (!op_def_info)
2014 return opt_result::failure_at (phi, "unsupported phi\n");
2016 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2017 && (STMT_VINFO_RELEVANT (op_def_info)
2018 != vect_used_in_outer_by_reduction))
2019 return opt_result::failure_at (phi, "unsupported phi\n");
2021 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2022 || (STMT_VINFO_DEF_TYPE (stmt_info)
2023 == vect_double_reduction_def))
2024 && !vectorizable_lc_phi (loop_vinfo,
2025 stmt_info, NULL, NULL))
2026 return opt_result::failure_at (phi, "unsupported phi\n");
2029 continue;
2032 gcc_assert (stmt_info);
2034 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2035 || STMT_VINFO_LIVE_P (stmt_info))
2036 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2037 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2038 /* A scalar-dependence cycle that we don't support. */
2039 return opt_result::failure_at (phi,
2040 "not vectorized:"
2041 " scalar dependence cycle.\n");
2043 if (STMT_VINFO_RELEVANT_P (stmt_info))
2045 need_to_vectorize = true;
2046 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2047 && ! PURE_SLP_STMT (stmt_info))
2048 ok = vectorizable_induction (loop_vinfo,
2049 stmt_info, NULL, NULL,
2050 &cost_vec);
2051 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2052 || (STMT_VINFO_DEF_TYPE (stmt_info)
2053 == vect_double_reduction_def)
2054 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2055 && ! PURE_SLP_STMT (stmt_info))
2056 ok = vectorizable_reduction (loop_vinfo,
2057 stmt_info, NULL, NULL, &cost_vec);
2058 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2059 == vect_first_order_recurrence)
2060 && ! PURE_SLP_STMT (stmt_info))
2061 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2062 &cost_vec);
2065 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2066 if (ok
2067 && STMT_VINFO_LIVE_P (stmt_info)
2068 && !PURE_SLP_STMT (stmt_info))
2069 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2070 -1, false, &cost_vec);
2072 if (!ok)
2073 return opt_result::failure_at (phi,
2074 "not vectorized: relevant phi not "
2075 "supported: %G",
2076 static_cast <gimple *> (phi));
2079 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2080 gsi_next (&si))
2082 gimple *stmt = gsi_stmt (si);
2083 if (!gimple_clobber_p (stmt)
2084 && !is_gimple_debug (stmt))
2086 opt_result res
2087 = vect_analyze_stmt (loop_vinfo,
2088 loop_vinfo->lookup_stmt (stmt),
2089 &need_to_vectorize,
2090 NULL, NULL, &cost_vec);
2091 if (!res)
2092 return res;
2095 } /* bbs */
2097 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2099 /* All operations in the loop are either irrelevant (they deal with loop
2100 control, or are dead), or only used outside the loop and can be moved
2101 out of the loop (e.g. invariants, inductions). The loop can be
2102 optimized away by scalar optimizations. We're better off not
2103 touching this loop. */
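/* For illustration only (an assumed example): in

     for (i = 0; i < n; i++)
       t = x + y;
     use (t);

   the only computation is loop-invariant and only its final value is used,
   so there is nothing left that actually needs to be vectorized.  */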
2104 if (!need_to_vectorize)
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_NOTE, vect_location,
2108 "All the computation can be taken out of the loop.\n");
2109 return opt_result::failure_at
2110 (vect_location,
2111 "not vectorized: redundant loop. no profit to vectorize.\n");
2114 return opt_result::success ();
2117 /* Return true if we know that the iteration count is smaller than the
2118 vectorization factor. Return false if it isn't, or if we can't be sure
2119 either way. */
2121 static bool
2122 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2124 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2126 HOST_WIDE_INT max_niter;
2127 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2128 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2129 else
2130 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2132 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2133 return true;
2135 return false;
2138 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2139 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2140 definitely no, or -1 if it's worth retrying. */
2142 static int
2143 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2144 unsigned *suggested_unroll_factor)
2146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2147 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2149 /* Only loops that can handle partially-populated vectors can have iteration
2150 counts less than the vectorization factor. */
2151 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2152 && vect_known_niters_smaller_than_vf (loop_vinfo))
2154 if (dump_enabled_p ())
2155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2156 "not vectorized: iteration count smaller than "
2157 "vectorization factor.\n");
2158 return 0;
2161 /* If we know the number of iterations we can do better: for the
2162 epilogue we can also decide whether the main loop leaves us
2163 with enough iterations, preferring a smaller vector epilogue that is
2164 then also possibly used for the case we skip the vector loop. */
2165 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2167 widest_int scalar_niters
2168 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2169 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2171 loop_vec_info orig_loop_vinfo
2172 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2173 unsigned lowest_vf
2174 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2175 int prolog_peeling = 0;
2176 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2177 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2178 if (prolog_peeling >= 0
2179 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2180 lowest_vf))
2182 unsigned gap
2183 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2184 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2185 % lowest_vf + gap);
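/* For illustration only (hypothetical numbers): with 23 scalar iterations,
   prolog_peeling == 3, lowest_vf == 8 and no gap, the main loop covers
   16 iterations and the epilogue analyzed here is left with
   (23 - 3 - 0) % 8 + 0 == 4 scalar iterations.  */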
2188 /* Reject vectorizing for a single scalar iteration, even if
2189 we could in principle implement that using partial vectors. */
2190 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2191 if (scalar_niters <= peeling_gap + 1)
2193 if (dump_enabled_p ())
2194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2195 "not vectorized: loop only has a single "
2196 "scalar iteration.\n");
2197 return 0;
2200 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2202 /* Check that the loop processes at least one full vector. */
2203 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2204 if (known_lt (scalar_niters, vf))
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "loop does not have enough iterations "
2209 "to support vectorization.\n");
2210 return 0;
2213 /* If we need to peel an extra epilogue iteration to handle data
2214 accesses with gaps, check that there are enough scalar iterations
2215 available.
2217 The check above is redundant with this one when peeling for gaps,
2218 but the distinction is useful for diagnostics. */
2219 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2220 && known_le (scalar_niters, vf))
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 "loop does not have enough iterations "
2225 "to support peeling for gaps.\n");
2226 return 0;
2231 /* If using the "very cheap" model, reject cases in which we'd keep
2232 a copy of the scalar code (even if we might be able to vectorize it). */
2233 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2234 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2235 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2236 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2238 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240 "some scalar iterations would need to be peeled\n");
2241 return 0;
2244 int min_profitable_iters, min_profitable_estimate;
2245 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2246 &min_profitable_estimate,
2247 suggested_unroll_factor);
2249 if (min_profitable_iters < 0)
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2253 "not vectorized: vectorization not profitable.\n");
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "not vectorized: vector version will never be "
2257 "profitable.\n");
2258 return -1;
2261 int min_scalar_loop_bound = (param_min_vect_loop_bound
2262 * assumed_vf);
2264 /* Use the cost model only if it is more conservative than user specified
2265 threshold. */
2266 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2267 min_profitable_iters);
2269 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2271 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2272 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2276 "not vectorized: vectorization not profitable.\n");
2277 if (dump_enabled_p ())
2278 dump_printf_loc (MSG_NOTE, vect_location,
2279 "not vectorized: iteration count smaller than user "
2280 "specified loop bound parameter or minimum profitable "
2281 "iterations (whichever is more conservative).\n");
2282 return 0;
2285 /* The static profitability threshold min_profitable_estimate includes
2286 the cost of having to check at runtime whether the scalar loop
2287 should be used instead. If it turns out that we don't need or want
2288 such a check, the threshold we should use for the static estimate
2289 is simply the point at which the vector loop becomes more profitable
2290 than the scalar loop. */
2291 if (min_profitable_estimate > min_profitable_iters
2292 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2293 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2294 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2295 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2299 " choice between the scalar and vector loops\n");
2300 min_profitable_estimate = min_profitable_iters;
2303 /* If the vector loop needs multiple iterations to be beneficial then
2304 things are probably too close to call, and the conservative thing
2305 would be to stick with the scalar code. */
2306 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2307 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "one iteration of the vector loop would be"
2312 " more expensive than the equivalent number of"
2313 " iterations of the scalar loop\n");
2314 return 0;
2317 HOST_WIDE_INT estimated_niter;
2319 /* If we are vectorizing an epilogue then we know the maximum number of
2320 scalar iterations it will cover is at least one lower than the
2321 vectorization factor of the main loop. */
2322 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2323 estimated_niter
2324 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2325 else
2327 estimated_niter = estimated_stmt_executions_int (loop);
2328 if (estimated_niter == -1)
2329 estimated_niter = likely_max_stmt_executions_int (loop);
2331 if (estimated_niter != -1
2332 && ((unsigned HOST_WIDE_INT) estimated_niter
2333 < MAX (th, (unsigned) min_profitable_estimate)))
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "not vectorized: estimated iteration count too "
2338 "small.\n");
2339 if (dump_enabled_p ())
2340 dump_printf_loc (MSG_NOTE, vect_location,
2341 "not vectorized: estimated iteration count smaller "
2342 "than specified loop bound parameter or minimum "
2343 "profitable iterations (whichever is more "
2344 "conservative).\n");
2345 return -1;
2348 return 1;
2351 static opt_result
2352 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2353 vec<data_reference_p> *datarefs,
2354 unsigned int *n_stmts)
2356 *n_stmts = 0;
2357 for (unsigned i = 0; i < loop->num_nodes; i++)
2358 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2359 !gsi_end_p (gsi); gsi_next (&gsi))
2361 gimple *stmt = gsi_stmt (gsi);
2362 if (is_gimple_debug (stmt))
2363 continue;
2364 ++(*n_stmts);
2365 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2366 NULL, 0);
2367 if (!res)
2369 if (is_gimple_call (stmt) && loop->safelen)
2371 tree fndecl = gimple_call_fndecl (stmt), op;
2372 if (fndecl == NULL_TREE
2373 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2375 fndecl = gimple_call_arg (stmt, 0);
2376 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2377 fndecl = TREE_OPERAND (fndecl, 0);
2378 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2380 if (fndecl != NULL_TREE)
2382 cgraph_node *node = cgraph_node::get (fndecl);
2383 if (node != NULL && node->simd_clones != NULL)
2385 unsigned int j, n = gimple_call_num_args (stmt);
2386 for (j = 0; j < n; j++)
2388 op = gimple_call_arg (stmt, j);
2389 if (DECL_P (op)
2390 || (REFERENCE_CLASS_P (op)
2391 && get_base_address (op)))
2392 break;
2394 op = gimple_call_lhs (stmt);
2395 /* Ignore #pragma omp declare simd functions
2396 if they don't have data references in the
2397 call stmt itself. */
2398 if (j == n
2399 && !(op
2400 && (DECL_P (op)
2401 || (REFERENCE_CLASS_P (op)
2402 && get_base_address (op)))))
2403 continue;
2407 return res;
2409 /* If dependence analysis will give up due to the limit on the
2410 number of datarefs, stop here and fail fatally. */
2411 if (datarefs->length ()
2412 > (unsigned)param_loop_max_datarefs_for_datadeps)
2413 return opt_result::failure_at (stmt, "exceeded param "
2414 "loop-max-datarefs-for-datadeps\n");
2416 return opt_result::success ();
2419 /* Look for SLP-only access groups and turn each individual access into its own
2420 group. */
2421 static void
2422 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2424 unsigned int i;
2425 struct data_reference *dr;
2427 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2429 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2430 FOR_EACH_VEC_ELT (datarefs, i, dr)
2432 gcc_assert (DR_REF (dr));
2433 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2435 /* Check if the load is a part of an interleaving chain. */
2436 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2438 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2439 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2440 unsigned int group_size = DR_GROUP_SIZE (first_element);
2442 /* Check if this is an SLP-only group. */
2443 if (!STMT_SLP_TYPE (stmt_info)
2444 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2446 /* Dissolve the group. */
2447 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2449 stmt_vec_info vinfo = first_element;
2450 while (vinfo)
2452 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2453 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2454 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2455 DR_GROUP_SIZE (vinfo) = 1;
2456 if (STMT_VINFO_STRIDED_P (first_element)
2457 /* We cannot handle stores with gaps. */
2458 || DR_IS_WRITE (dr_info->dr))
2460 STMT_VINFO_STRIDED_P (vinfo) = true;
2461 DR_GROUP_GAP (vinfo) = 0;
2463 else
2464 DR_GROUP_GAP (vinfo) = group_size - 1;
2465 /* Duplicate and adjust the alignment info; it needs to
2466 be present on each group leader, see dr_misalignment. */
2467 if (vinfo != first_element)
2469 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2470 dr_info2->target_alignment = dr_info->target_alignment;
2471 int misalignment = dr_info->misalignment;
2472 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2474 HOST_WIDE_INT diff
2475 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2476 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2477 unsigned HOST_WIDE_INT align_c
2478 = dr_info->target_alignment.to_constant ();
2479 misalignment = (misalignment + diff) % align_c;
2481 dr_info2->misalignment = misalignment;
2483 vinfo = next;
2490 /* Determine if operating on full vectors for LOOP_VINFO might leave
2491 some scalar iterations still to do. If so, decide how we should
2492 handle those scalar iterations. The possibilities are:
2494 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2495 In this case:
2497 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2498 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2499 LOOP_VINFO_PEELING_FOR_NITER == false
2501 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2502 to handle the remaining scalar iterations. In this case:
2504 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2505 LOOP_VINFO_PEELING_FOR_NITER == true
2507 There are two choices:
2509 (2a) Consider vectorizing the epilogue loop at the same VF as the
2510 main loop, but using partial vectors instead of full vectors.
2511 In this case:
2513 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2515 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2516 In this case:
2518 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2521 opt_result
2522 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2524 /* Determine whether there would be any scalar iterations left over. */
2525 bool need_peeling_or_partial_vectors_p
2526 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2528 /* Decide whether to vectorize the loop with partial vectors. */
2529 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2530 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2531 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2532 && need_peeling_or_partial_vectors_p)
2534 /* For partial-vector-usage=1, try to push the handling of partial
2535 vectors to the epilogue, with the main loop continuing to operate
2536 on full vectors.
2538 If we are unrolling we also do not want to use partial vectors. This
2539 is to avoid the overhead of generating multiple masks and also to
2540 avoid having to execute entire iterations of FALSE masked instructions
2541 when dealing with one or fewer full iterations.
2543 ??? We could then end up failing to use partial vectors if we
2544 decide to peel iterations into a prologue, and if the main loop
2545 then ends up processing fewer than VF iterations. */
2546 if ((param_vect_partial_vector_usage == 1
2547 || loop_vinfo->suggested_unroll_factor > 1)
2548 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2549 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2550 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2551 else
2552 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2555 if (dump_enabled_p ())
2556 dump_printf_loc (MSG_NOTE, vect_location,
2557 "operating on %s vectors%s.\n",
2558 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2559 ? "partial" : "full",
2560 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2561 ? " for epilogue loop" : "");
2563 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2564 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2565 && need_peeling_or_partial_vectors_p);
2567 return opt_result::success ();
2570 /* Function vect_analyze_loop_2.
2572 Apply a set of analyses on the LOOP specified by LOOP_VINFO; the different
2573 analyses will record information in some members of LOOP_VINFO. FATAL
2574 indicates whether some analysis hit a fatal error. If the pointer
2575 SUGGESTED_UNROLL_FACTOR is non-NULL, it is intended to be filled with the
2576 worked-out suggested unroll factor, while a NULL pointer indicates that we
2577 are going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2578 holds the SLP decision made when the suggested unroll factor was worked
2579 out. */
2580 static opt_result
2581 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2582 unsigned *suggested_unroll_factor,
2583 bool& slp_done_for_suggested_uf)
2585 opt_result ok = opt_result::success ();
2586 int res;
2587 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2588 poly_uint64 min_vf = 2;
2589 loop_vec_info orig_loop_vinfo = NULL;
2591 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2592 loop_vec_info of the first vectorized loop. */
2593 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2594 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2595 else
2596 orig_loop_vinfo = loop_vinfo;
2597 gcc_assert (orig_loop_vinfo);
2599 /* The first group of checks is independent of the vector size. */
2600 fatal = true;
2602 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2603 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2604 return opt_result::failure_at (vect_location,
2605 "not vectorized: simd if(0)\n");
2607 /* Find all data references in the loop (which correspond to vdefs/vuses)
2608 and analyze their evolution in the loop. */
2610 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2612 /* Gather the data references and count stmts in the loop. */
2613 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2615 opt_result res
2616 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2617 &LOOP_VINFO_DATAREFS (loop_vinfo),
2618 &LOOP_VINFO_N_STMTS (loop_vinfo));
2619 if (!res)
2621 if (dump_enabled_p ())
2622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2623 "not vectorized: loop contains function "
2624 "calls or data references that cannot "
2625 "be analyzed\n");
2626 return res;
2628 loop_vinfo->shared->save_datarefs ();
2630 else
2631 loop_vinfo->shared->check_datarefs ();
2633 /* Analyze the data references and also adjust the minimal
2634 vectorization factor according to the loads and stores. */
2636 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2637 if (!ok)
2639 if (dump_enabled_p ())
2640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2641 "bad data references.\n");
2642 return ok;
2645 /* Check if we are applying unroll factor now. */
2646 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2647 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2649 /* If the SLP decision was false when the suggested unroll factor was worked
2650 out, and we are now applying that suggested unroll factor, we can simply skip
2651 all SLP-related analyses this time. */
2652 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2654 /* Classify all cross-iteration scalar data-flow cycles.
2655 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2656 vect_analyze_scalar_cycles (loop_vinfo, slp);
2658 vect_pattern_recog (loop_vinfo);
2660 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2662 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2663 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2665 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2666 if (!ok)
2668 if (dump_enabled_p ())
2669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2670 "bad data access.\n");
2671 return ok;
2674 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2676 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2677 if (!ok)
2679 if (dump_enabled_p ())
2680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2681 "unexpected pattern.\n");
2682 return ok;
2685 /* Errors are no longer fatal from here on, while the rest of the analysis
below depends on it in some way. */
2686 fatal = false;
2688 /* Analyze data dependences between the data-refs in the loop
2689 and adjust the maximum vectorization factor according to
2690 the dependences.
2691 FORNOW: fail at the first data dependence that we encounter. */
2693 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2694 if (!ok)
2696 if (dump_enabled_p ())
2697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2698 "bad data dependence.\n");
2699 return ok;
2701 if (max_vf != MAX_VECTORIZATION_FACTOR
2702 && maybe_lt (max_vf, min_vf))
2703 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2704 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2706 ok = vect_determine_vectorization_factor (loop_vinfo);
2707 if (!ok)
2709 if (dump_enabled_p ())
2710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2711 "can't determine vectorization factor.\n");
2712 return ok;
2714 if (max_vf != MAX_VECTORIZATION_FACTOR
2715 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2716 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2718 /* Compute the scalar iteration cost. */
2719 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2721 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2723 if (slp)
2725 /* Check the SLP opportunities in the loop, analyze and build
2726 SLP trees. */
2727 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2728 if (!ok)
2729 return ok;
2731 /* If there are any SLP instances mark them as pure_slp. */
2732 slp = vect_make_slp_decision (loop_vinfo);
2733 if (slp)
2735 /* Find stmts that need to be both vectorized and SLPed. */
2736 vect_detect_hybrid_slp (loop_vinfo);
2738 /* Update the vectorization factor based on the SLP decision. */
2739 vect_update_vf_for_slp (loop_vinfo);
2741 /* Optimize the SLP graph with the vectorization factor fixed. */
2742 vect_optimize_slp (loop_vinfo);
2744 /* Gather the loads reachable from the SLP graph entries. */
2745 vect_gather_slp_loads (loop_vinfo);
2749 bool saved_can_use_partial_vectors_p
2750 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2752 /* We don't expect to have to roll back to anything other than an empty
2753 set of rgroups. */
2754 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2756 /* This is the point where we can re-start analysis with SLP forced off. */
2757 start_over:
2759 /* Apply the suggested unrolling factor; this was determined by the backend
2760 during finish_cost the first time we ran the analysis for this
2761 vector mode. */
2762 if (applying_suggested_uf)
2763 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2765 /* Now the vectorization factor is final. */
2766 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2767 gcc_assert (known_ne (vectorization_factor, 0U));
2769 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "vectorization_factor = ");
2773 dump_dec (MSG_NOTE, vectorization_factor);
2774 dump_printf (MSG_NOTE, ", niters = %wd\n",
2775 LOOP_VINFO_INT_NITERS (loop_vinfo));
2778 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2780 /* Analyze the alignment of the data-refs in the loop.
2781 Fail if a data reference is found that cannot be vectorized. */
2783 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2784 if (!ok)
2786 if (dump_enabled_p ())
2787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2788 "bad data alignment.\n");
2789 return ok;
2792 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2793 It is important to call pruning after vect_analyze_data_ref_accesses,
2794 since we use grouping information gathered by interleaving analysis. */
2795 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2796 if (!ok)
2797 return ok;
2799 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2800 vectorization, since we do not want to add extra peeling or
2801 add versioning for alignment. */
2802 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2803 /* This pass will decide on using loop versioning and/or loop peeling in
2804 order to enhance the alignment of data references in the loop. */
2805 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2806 if (!ok)
2807 return ok;
2809 if (slp)
2811 /* Analyze operations in the SLP instances. Note this may
2812 remove unsupported SLP instances which makes the above
2813 SLP kind detection invalid. */
2814 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2815 vect_slp_analyze_operations (loop_vinfo);
2816 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2818 ok = opt_result::failure_at (vect_location,
2819 "unsupported SLP instances\n");
2820 goto again;
2823 /* Check whether any load in ALL SLP instances is possibly permuted. */
2824 slp_tree load_node, slp_root;
2825 unsigned i, x;
2826 slp_instance instance;
2827 bool can_use_lanes = true;
2828 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2830 slp_root = SLP_INSTANCE_TREE (instance);
2831 int group_size = SLP_TREE_LANES (slp_root);
2832 tree vectype = SLP_TREE_VECTYPE (slp_root);
2833 bool loads_permuted = false;
2834 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2836 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2837 continue;
2838 unsigned j;
2839 stmt_vec_info load_info;
2840 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2841 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2843 loads_permuted = true;
2844 break;
2848 /* If the loads and stores can be handled with load/store-lane
2849 instructions record it and move on to the next instance. */
2850 if (loads_permuted
2851 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2852 && vect_store_lanes_supported (vectype, group_size, false)
2853 != IFN_LAST)
2855 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2857 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2858 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2859 /* Use SLP for strided accesses (or if we can't
2860 use load-lanes). */
2861 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2862 || vect_load_lanes_supported
2863 (STMT_VINFO_VECTYPE (stmt_vinfo),
2864 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2865 break;
2868 can_use_lanes
2869 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2871 if (can_use_lanes && dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE, vect_location,
2873 "SLP instance %p can use load/store-lanes\n",
2874 (void *) instance);
2876 else
2878 can_use_lanes = false;
2879 break;
2883 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2884 with SLP disabled. */
2885 if (can_use_lanes)
2887 ok = opt_result::failure_at (vect_location,
2888 "Built SLP cancelled: can use "
2889 "load/store-lanes\n");
2890 if (dump_enabled_p ())
2891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2892 "Built SLP cancelled: all SLP instances support "
2893 "load/store-lanes\n");
2894 goto again;
2898 /* Dissolve SLP-only groups. */
2899 vect_dissolve_slp_only_groups (loop_vinfo);
2901 /* Scan all the remaining operations in the loop that are not subject
2902 to SLP and make sure they are vectorizable. */
2903 ok = vect_analyze_loop_operations (loop_vinfo);
2904 if (!ok)
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "bad operation or unsupported loop bound.\n");
2909 return ok;
2912 /* For now, we don't expect to mix both the masking and the length
2913 approaches for one loop, so disable partial vectors if both are recorded. */
2914 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2915 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2916 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2918 if (dump_enabled_p ())
2919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2920 "can't vectorize a loop with partial vectors"
2921 " because we don't expect to mix different"
2922 " approaches with partial vectors for the"
2923 " same loop.\n");
2924 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2927 /* If we still have the option of using partial vectors,
2928 check whether we can generate the necessary loop controls. */
2929 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2931 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2933 if (!vect_verify_full_masking (loop_vinfo)
2934 && !vect_verify_full_masking_avx512 (loop_vinfo))
2935 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2937 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2938 if (!vect_verify_loop_lens (loop_vinfo))
2939 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2942 /* If we're vectorizing a loop that uses length "controls" and
2943 can iterate more than once, we apply the decrementing IV approach
2944 to the loop control. */
2945 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2946 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2947 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2948 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2949 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2950 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2951 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2953 /* If a loop uses length controls and has a decrementing loop control IV,
2954 we will normally pass that IV through a MIN_EXPR to calculate the
2955 basis for the length controls. E.g. in a loop that processes one
2956 element per scalar iteration, the number of elements would be
2957 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2959 This MIN_EXPR approach allows us to use pointer IVs with an invariant
2960 step, since only the final iteration of the vector loop can have
2961 inactive lanes.
2963 However, some targets have a dedicated instruction for calculating the
2964 preferred length, given the total number of elements that still need to
2965 be processed. This is encapsulated in the SELECT_VL internal function.
2967 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2968 to determine the basis for the length controls. However, unlike the
2969 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2970 lanes inactive in any iteration of the vector loop, not just the last
2971 iteration. This SELECT_VL approach therefore requires us to use pointer
2972 IVs with variable steps.
2974 Once we've decided how many elements should be processed by one
2975 iteration of the vector loop, we need to populate the rgroup controls.
2976 If a loop has multiple rgroups, we need to make sure that those rgroups
2977 "line up" (that is, they must be consistent about which elements are
2978 active and which aren't). This is done by vect_adjust_loop_lens_control.
2980 In principle, it would be possible to use vect_adjust_loop_lens_control
2981 on either the result of a MIN_EXPR or the result of a SELECT_VL.
2982 However:
2984 (1) In practice, it only makes sense to use SELECT_VL when a vector
2985 operation will be controlled directly by the result. It is not
2986 worth using SELECT_VL if it would only be the input to other
2987 calculations.
2989 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2990 pointer IV will need N updates by a variable amount (N-1 updates
2991 within the iteration and 1 update to move to the next iteration).
2993 Because of this, we prefer to use the MIN_EXPR approach whenever there
2994 is more than one length control.
2996 In addition, SELECT_VL always operates to a granularity of 1 unit.
2997 If we wanted to use it to control an SLP operation on N consecutive
2998 elements, we would need to make the SELECT_VL inputs measure scalar
2999 iterations (rather than elements) and then multiply the SELECT_VL
3000 result by N. But using SELECT_VL this way is inefficient because
3001 of (1) above.
3003 In addition, we don't apply SELECT_VL to a single-rgroup loop when both
3004 of the following are satisfied:
3006 (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3007 (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3009 Since SELECT_VL (with its variable step) would make SCEV analysis fail,
3010 we would also lose the benefits of the subsequent unroll optimizations.
3011 We prefer using the MIN_EXPR approach in this situation. */
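/* For illustration only (a sketch, not the code actually generated): with a
   single length control and VF elements per vector iteration, the MIN_EXPR
   scheme behaves roughly like

     remaining = n;
     do
       {
         len = MIN (remaining, VF);
         ... process LEN elements ...
         remaining -= len;
       }
     while (remaining > 0);

   so only the last iteration can be partial, whereas SELECT_VL may return a
   value smaller than VF in any iteration, which is why pointer IVs must then
   step by the variable LEN.  */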
3012 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3014 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3015 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3016 OPTIMIZE_FOR_SPEED)
3017 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3018 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3019 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3020 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3021 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3024 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3025 assuming that the loop will be used as a main loop. We will redo
3026 this analysis later if we instead decide to use the loop as an
3027 epilogue loop. */
3028 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3029 if (!ok)
3030 return ok;
3032 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3033 to be able to handle fewer than VF scalars, or needs to have a lower VF
3034 than the main loop. */
3035 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3036 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3038 poly_uint64 unscaled_vf
3039 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3040 orig_loop_vinfo->suggested_unroll_factor);
3041 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3042 return opt_result::failure_at (vect_location,
3043 "Vectorization factor too high for"
3044 " epilogue loop.\n");
3047 /* Check the costings of the loop make vectorizing worthwhile. */
3048 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3049 if (res < 0)
3051 ok = opt_result::failure_at (vect_location,
3052 "Loop costings may not be worthwhile.\n");
3053 goto again;
3055 if (!res)
3056 return opt_result::failure_at (vect_location,
3057 "Loop costings not worthwhile.\n");
3059 /* If an epilogue loop is required make sure we can create one. */
3060 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3061 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3063 if (dump_enabled_p ())
3064 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3065 if (!vect_can_advance_ivs_p (loop_vinfo)
3066 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3067 single_exit (LOOP_VINFO_LOOP
3068 (loop_vinfo))))
3070 ok = opt_result::failure_at (vect_location,
3071 "not vectorized: can't create required "
3072 "epilog loop\n");
3073 goto again;
3077 /* During peeling, we need to check if the number of loop iterations is
3078 enough for both the peeled prolog loop and the vector loop. This check
3079 can be merged with the threshold check of loop versioning, so
3080 increase the threshold for this case if necessary.
3082 If we are analyzing an epilogue we still want to check what its
3083 versioning threshold would be. If we decide to vectorize the epilogues we
3084 will want to use the lowest versioning threshold of all epilogues and main
3085 loop. This will enable us to enter a vectorized epilogue even when
3086 versioning the loop. We can't simply check whether the epilogue requires
3087 versioning though since we may have skipped some versioning checks when
3088 analyzing the epilogue. For instance, checks for alias versioning will be
3089 skipped when dealing with epilogues as we assume we already checked them
3090 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3091 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3093 poly_uint64 niters_th = 0;
3094 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3096 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3098 /* Niters for peeled prolog loop. */
3099 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3101 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3102 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3103 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3105 else
3106 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3109 /* Niters for at least one iteration of vectorized loop. */
3110 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3111 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3112 /* One additional iteration because of peeling for gap. */
3113 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3114 niters_th += 1;
3116 /* Use the same condition as vect_transform_loop to decide when to use
3117 the cost to determine a versioning threshold. */
3118 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3119 && ordered_p (th, niters_th))
3120 niters_th = ordered_max (poly_uint64 (th), niters_th);
3122 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3125 gcc_assert (known_eq (vectorization_factor,
3126 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3128 slp_done_for_suggested_uf = slp;
3130 /* Ok to vectorize! */
3131 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3132 return opt_result::success ();
3134 again:
3135 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3136 gcc_assert (!ok);
3138 /* Try again with SLP forced off but if we didn't do any SLP there is
3139 no point in re-trying. */
3140 if (!slp)
3141 return ok;
3143 /* If the SLP decision was true when the suggested unroll factor was worked
3144 out, and we are applying that suggested unroll factor, we don't need to
3145 re-try any more. */
3146 if (applying_suggested_uf && slp_done_for_suggested_uf)
3147 return ok;
3149 /* If there are reduction chains re-trying will fail anyway. */
3150 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3151 return ok;
3153 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3154 via interleaving or lane instructions. */
3155 slp_instance instance;
3156 slp_tree node;
3157 unsigned i, j;
3158 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3160 stmt_vec_info vinfo;
3161 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3162 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3163 continue;
3164 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3165 unsigned int size = DR_GROUP_SIZE (vinfo);
3166 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3167 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3168 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3169 && ! vect_grouped_store_supported (vectype, size))
3170 return opt_result::failure_at (vinfo->stmt,
3171 "unsupported grouped store\n");
3172 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3174 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3175 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3176 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3177 size = DR_GROUP_SIZE (vinfo);
3178 vectype = STMT_VINFO_VECTYPE (vinfo);
3179 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3180 && ! vect_grouped_load_supported (vectype, single_element_p,
3181 size))
3182 return opt_result::failure_at (vinfo->stmt,
3183 "unsupported grouped load\n");
3187 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_NOTE, vect_location,
3189 "re-trying with SLP disabled\n");
3191 /* Roll back state appropriately. No SLP this time. */
3192 slp = false;
3193 /* Restore vectorization factor as it were without SLP. */
3194 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3195 /* Free the SLP instances. */
3196 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3197 vect_free_slp_instance (instance);
3198 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3199 /* Reset SLP type to loop_vect on all stmts. */
3200 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3202 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3203 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3204 !gsi_end_p (si); gsi_next (&si))
3206 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3207 STMT_SLP_TYPE (stmt_info) = loop_vect;
3208 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3209 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3211 /* vectorizable_reduction adjusts reduction stmt def-types,
3212 restore them to that of the PHI. */
3213 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3214 = STMT_VINFO_DEF_TYPE (stmt_info);
3215 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3216 (STMT_VINFO_REDUC_DEF (stmt_info)))
3217 = STMT_VINFO_DEF_TYPE (stmt_info);
3220 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3221 !gsi_end_p (si); gsi_next (&si))
3223 if (is_gimple_debug (gsi_stmt (si)))
3224 continue;
3225 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3226 STMT_SLP_TYPE (stmt_info) = loop_vect;
3227 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3229 stmt_vec_info pattern_stmt_info
3230 = STMT_VINFO_RELATED_STMT (stmt_info);
3231 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3232 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3234 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3235 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3236 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3237 !gsi_end_p (pi); gsi_next (&pi))
3238 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3239 = loop_vect;
3243 /* Free optimized alias test DDRS. */
3244 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3245 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3246 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3247 /* Reset target cost data. */
3248 delete loop_vinfo->vector_costs;
3249 loop_vinfo->vector_costs = nullptr;
3250 /* Reset accumulated rgroup information. */
3251 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3252 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3253 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3254 /* Reset assorted flags. */
3255 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3256 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3257 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3258 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3259 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3260 = saved_can_use_partial_vectors_p;
3262 goto start_over;
3265 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3266 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3267 OLD_LOOP_VINFO is better unless something specifically indicates
3268 otherwise.
3270 Note that this deliberately isn't a partial order. */
3272 static bool
3273 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3274 loop_vec_info old_loop_vinfo)
3276 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3277 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3279 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3280 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3282 /* Always prefer a VF of loop->simdlen over any other VF. */
3283 if (loop->simdlen)
3285 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3286 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3287 if (new_simdlen_p != old_simdlen_p)
3288 return new_simdlen_p;
3291 const auto *old_costs = old_loop_vinfo->vector_costs;
3292 const auto *new_costs = new_loop_vinfo->vector_costs;
3293 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3294 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3296 return new_costs->better_main_loop_than_p (old_costs);
3299 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3300 true if we should. */
3302 static bool
3303 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3304 loop_vec_info old_loop_vinfo)
3306 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3307 return false;
3309 if (dump_enabled_p ())
3310 dump_printf_loc (MSG_NOTE, vect_location,
3311 "***** Preferring vector mode %s to vector mode %s\n",
3312 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3313 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3314 return true;
3317 /* Analyze LOOP with VECTOR_MODES[MODE_I], and as an epilogue if MAIN_LOOP_VINFO
3318 is not NULL. Set AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is VOIDmode
3319 and advance MODE_I to the next mode useful to analyze.
3320 Return the loop_vinfo on success and a wrapped null on failure. */
3322 static opt_loop_vec_info
3323 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3324 const vect_loop_form_info *loop_form_info,
3325 loop_vec_info main_loop_vinfo,
3326 const vector_modes &vector_modes, unsigned &mode_i,
3327 machine_mode &autodetected_vector_mode,
3328 bool &fatal)
3330 loop_vec_info loop_vinfo
3331 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3333 machine_mode vector_mode = vector_modes[mode_i];
3334 loop_vinfo->vector_mode = vector_mode;
3335 unsigned int suggested_unroll_factor = 1;
3336 bool slp_done_for_suggested_uf = false;
3338 /* Run the main analysis. */
3339 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3340 &suggested_unroll_factor,
3341 slp_done_for_suggested_uf);
3342 if (dump_enabled_p ())
3343 dump_printf_loc (MSG_NOTE, vect_location,
3344 "***** Analysis %s with vector mode %s\n",
3345 res ? "succeeded" : " failed",
3346 GET_MODE_NAME (loop_vinfo->vector_mode));
3348 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3350 if (dump_enabled_p ())
3351 dump_printf_loc (MSG_NOTE, vect_location,
3352 "***** Re-trying analysis for unrolling"
3353 " with unroll factor %d and slp %s.\n",
3354 suggested_unroll_factor,
3355 slp_done_for_suggested_uf ? "on" : "off");
3356 loop_vec_info unroll_vinfo
3357 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3358 unroll_vinfo->vector_mode = vector_mode;
3359 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3360 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3361 slp_done_for_suggested_uf);
3362 if (new_res)
3364 delete loop_vinfo;
3365 loop_vinfo = unroll_vinfo;
3367 else
3368 delete unroll_vinfo;
3371 /* Remember the autodetected vector mode. */
3372 if (vector_mode == VOIDmode)
3373 autodetected_vector_mode = loop_vinfo->vector_mode;
3375 /* Advance mode_i, first skipping modes that would result in the
3376 same analysis result. */
3377 while (mode_i + 1 < vector_modes.length ()
3378 && vect_chooses_same_modes_p (loop_vinfo,
3379 vector_modes[mode_i + 1]))
3381 if (dump_enabled_p ())
3382 dump_printf_loc (MSG_NOTE, vect_location,
3383 "***** The result for vector mode %s would"
3384 " be the same\n",
3385 GET_MODE_NAME (vector_modes[mode_i + 1]));
3386 mode_i += 1;
3388 if (mode_i + 1 < vector_modes.length ()
3389 && VECTOR_MODE_P (autodetected_vector_mode)
3390 && (related_vector_mode (vector_modes[mode_i + 1],
3391 GET_MODE_INNER (autodetected_vector_mode))
3392 == autodetected_vector_mode)
3393 && (related_vector_mode (autodetected_vector_mode,
3394 GET_MODE_INNER (vector_modes[mode_i + 1]))
3395 == vector_modes[mode_i + 1]))
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "***** Skipping vector mode %s, which would"
3400 " repeat the analysis for %s\n",
3401 GET_MODE_NAME (vector_modes[mode_i + 1]),
3402 GET_MODE_NAME (autodetected_vector_mode));
3403 mode_i += 1;
3405 mode_i++;
3407 if (!res)
3409 delete loop_vinfo;
3410 if (fatal)
3411 gcc_checking_assert (main_loop_vinfo == NULL);
3412 return opt_loop_vec_info::propagate_failure (res);
3415 return opt_loop_vec_info::success (loop_vinfo);
3418 /* Function vect_analyze_loop.
3420 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3421 for it. The different analyses will record information in the
3422 loop_vec_info struct. */
3423 opt_loop_vec_info
3424 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3426 DUMP_VECT_SCOPE ("analyze_loop_nest");
3428 if (loop_outer (loop)
3429 && loop_vec_info_for_loop (loop_outer (loop))
3430 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3431 return opt_loop_vec_info::failure_at (vect_location,
3432 "outer-loop already vectorized.\n");
3434 if (!find_loop_nest (loop, &shared->loop_nest))
3435 return opt_loop_vec_info::failure_at
3436 (vect_location,
3437 "not vectorized: loop nest containing two or more consecutive inner"
3438 " loops cannot be vectorized\n");
3440 /* Analyze the loop form. */
3441 vect_loop_form_info loop_form_info;
3442 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3443 if (!res)
3445 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3447 "bad loop form.\n");
3448 return opt_loop_vec_info::propagate_failure (res);
3450 if (!integer_onep (loop_form_info.assumptions))
3452 /* We consider vectorizing this loop by versioning it under
3453 some assumptions. In order to do this, we need to clear
3454 existing information computed by the scev and niter analyzers. */
3455 scev_reset_htab ();
3456 free_numbers_of_iterations_estimates (loop);
3457 /* Also set a flag for this loop so that the following scev and niter
3458 analyses are done under the assumptions. */
3459 loop_constraint_set (loop, LOOP_C_FINITE);
3462 auto_vector_modes vector_modes;
3463 /* Autodetect first vector size we try. */
3464 vector_modes.safe_push (VOIDmode);
3465 unsigned int autovec_flags
3466 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3467 loop->simdlen != 0);
3468 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3469 && !unlimited_cost_model (loop));
3470 machine_mode autodetected_vector_mode = VOIDmode;
3471 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3472 unsigned int mode_i = 0;
3473 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3475 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3476 a mode has not been analyzed. */
3477 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3478 for (unsigned i = 0; i < vector_modes.length (); ++i)
3479 cached_vf_per_mode.safe_push (0);
3481 /* First determine the main loop vectorization mode, either the first
3482 one that works, starting with auto-detecting the vector mode and then
3483 following the targets order of preference, or the one with the
3484 lowest cost if pick_lowest_cost_p. */
3485 while (1)
3487 bool fatal;
3488 unsigned int last_mode_i = mode_i;
3489 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3490 failed. */
3491 cached_vf_per_mode[last_mode_i] = -1;
3492 opt_loop_vec_info loop_vinfo
3493 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3494 NULL, vector_modes, mode_i,
3495 autodetected_vector_mode, fatal);
3496 if (fatal)
3497 break;
3499 if (loop_vinfo)
3501 /* Analysis has been successful so update the VF value. The
3502 VF should always be a multiple of unroll_factor and we want to
3503 capture the original VF here. */
3504 cached_vf_per_mode[last_mode_i]
3505 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3506 loop_vinfo->suggested_unroll_factor);
3507 /* Once we hit the desired simdlen for the first time,
3508 discard any previous attempts. */
3509 if (simdlen
3510 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3512 delete first_loop_vinfo;
3513 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3514 simdlen = 0;
3516 else if (pick_lowest_cost_p
3517 && first_loop_vinfo
3518 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3520 /* Pick loop_vinfo over first_loop_vinfo. */
3521 delete first_loop_vinfo;
3522 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3524 if (first_loop_vinfo == NULL)
3525 first_loop_vinfo = loop_vinfo;
3526 else
3528 delete loop_vinfo;
3529 loop_vinfo = opt_loop_vec_info::success (NULL);
3532 /* Commit to first_loop_vinfo if we have no reason to try
3533 alternatives. */
3534 if (!simdlen && !pick_lowest_cost_p)
3535 break;
3537 if (mode_i == vector_modes.length ()
3538 || autodetected_vector_mode == VOIDmode)
3539 break;
3541 /* Try the next biggest vector size. */
3542 if (dump_enabled_p ())
3543 dump_printf_loc (MSG_NOTE, vect_location,
3544 "***** Re-trying analysis with vector mode %s\n",
3545 GET_MODE_NAME (vector_modes[mode_i]));
3547 if (!first_loop_vinfo)
3548 return opt_loop_vec_info::propagate_failure (res);
3550 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_NOTE, vect_location,
3552 "***** Choosing vector mode %s\n",
3553 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3555 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3556 enabled, SIMDUID is not set, it is the innermost loop and we have
3557 either already found the loop's SIMDLEN or there was no SIMDLEN to
3558 begin with.
3559 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3560 bool vect_epilogues = (!simdlen
3561 && loop->inner == NULL
3562 && param_vect_epilogues_nomask
3563 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3564 && !loop->simduid);
3565 if (!vect_epilogues)
3566 return first_loop_vinfo;
3568 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3569 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3571 /* For epilogues start the analysis from the first mode. The motivation
3572 behind starting from the beginning comes from cases where the VECTOR_MODES
3573 array may contain length-agnostic and length-specific modes. Their
3574 ordering is not guaranteed, so we could end up picking a mode for the main
3575 loop that is after the epilogue's optimal mode. */
3576 vector_modes[0] = autodetected_vector_mode;
3577 mode_i = 0;
3579 bool supports_partial_vectors =
3580 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3581 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3583 while (1)
3585 /* If the target does not support partial vectors we can shorten the
3586 number of modes to analyze for the epilogue as we know we can't pick a
3587 mode that would lead to a VF at least as big as the
3588 FIRST_VINFO_VF. */
3589 if (!supports_partial_vectors
3590 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3592 mode_i++;
3593 if (mode_i == vector_modes.length ())
3594 break;
3595 continue;
3598 if (dump_enabled_p ())
3599 dump_printf_loc (MSG_NOTE, vect_location,
3600 "***** Re-trying epilogue analysis with vector "
3601 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3603 bool fatal;
3604 opt_loop_vec_info loop_vinfo
3605 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3606 first_loop_vinfo,
3607 vector_modes, mode_i,
3608 autodetected_vector_mode, fatal);
3609 if (fatal)
3610 break;
3612 if (loop_vinfo)
3614 if (pick_lowest_cost_p)
3616 /* Keep trying to roll back vectorization attempts while the
3617 loop_vec_infos they produced were worse than this one. */
3618 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3619 while (!vinfos.is_empty ()
3620 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3622 gcc_assert (vect_epilogues);
3623 delete vinfos.pop ();
3626 /* For now only allow one epilogue loop. */
3627 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3629 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3630 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3631 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3632 || maybe_ne (lowest_th, 0U));
3633 /* Keep track of the known smallest versioning
3634 threshold. */
3635 if (ordered_p (lowest_th, th))
3636 lowest_th = ordered_min (lowest_th, th);
3638 else
3640 delete loop_vinfo;
3641 loop_vinfo = opt_loop_vec_info::success (NULL);
3644 /* For now only allow one epilogue loop, but allow
3645 pick_lowest_cost_p to replace it, so commit to the
3646 first epilogue if we have no reason to try alternatives. */
3647 if (!pick_lowest_cost_p)
3648 break;
3651 if (mode_i == vector_modes.length ())
3652 break;
3656 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3658 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location,
3661 "***** Choosing epilogue vector mode %s\n",
3662 GET_MODE_NAME
3663 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3666 return first_loop_vinfo;
3669 /* Return true if there is an in-order reduction function for CODE, storing
3670 it in *REDUC_FN if so. */
3672 static bool
3673 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3675 if (code == PLUS_EXPR)
3677 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3678 return true;
3680 return false;
3683 /* Function reduction_fn_for_scalar_code
3685 Input:
3686 CODE - tree_code of a reduction operation.
3688 Output:
3689 REDUC_FN - the corresponding internal function to be used to reduce the
3690 vector of partial results into a single scalar result, or IFN_LAST
3691 if the operation is a supported reduction operation, but does not have
3692 such an internal function.
3694 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3696 bool
3697 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3699 if (code.is_tree_code ())
3700 switch (tree_code (code))
3702 case MAX_EXPR:
3703 *reduc_fn = IFN_REDUC_MAX;
3704 return true;
3706 case MIN_EXPR:
3707 *reduc_fn = IFN_REDUC_MIN;
3708 return true;
3710 case PLUS_EXPR:
3711 *reduc_fn = IFN_REDUC_PLUS;
3712 return true;
3714 case BIT_AND_EXPR:
3715 *reduc_fn = IFN_REDUC_AND;
3716 return true;
3718 case BIT_IOR_EXPR:
3719 *reduc_fn = IFN_REDUC_IOR;
3720 return true;
3722 case BIT_XOR_EXPR:
3723 *reduc_fn = IFN_REDUC_XOR;
3724 return true;
3726 case MULT_EXPR:
3727 case MINUS_EXPR:
3728 *reduc_fn = IFN_LAST;
3729 return true;
3731 default:
3732 return false;
3734 else
3735 switch (combined_fn (code))
3737 CASE_CFN_FMAX:
3738 *reduc_fn = IFN_REDUC_FMAX;
3739 return true;
3741 CASE_CFN_FMIN:
3742 *reduc_fn = IFN_REDUC_FMIN;
3743 return true;
3745 default:
3746 return false;
3750 /* If there is a neutral value X such that a reduction would not be affected
3751 by the introduction of additional X elements, return that X, otherwise
3752 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3753 of the scalar elements. If the reduction has just a single initial value
3754 then INITIAL_VALUE is that value, otherwise it is null. */
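   /* For example, x + 0 == x and x * 1 == x, so zero and one are the
      neutral values for PLUS_EXPR and MULT_EXPR respectively; MIN_EXPR
      and MAX_EXPR have no such constant, so the single initial value
      itself is used instead, when one is available.  */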
3756 tree
3757 neutral_op_for_reduction (tree scalar_type, code_helper code,
3758 tree initial_value)
3760 if (code.is_tree_code ())
3761 switch (tree_code (code))
3763 case WIDEN_SUM_EXPR:
3764 case DOT_PROD_EXPR:
3765 case SAD_EXPR:
3766 case PLUS_EXPR:
3767 case MINUS_EXPR:
3768 case BIT_IOR_EXPR:
3769 case BIT_XOR_EXPR:
3770 return build_zero_cst (scalar_type);
3772 case MULT_EXPR:
3773 return build_one_cst (scalar_type);
3775 case BIT_AND_EXPR:
3776 return build_all_ones_cst (scalar_type);
3778 case MAX_EXPR:
3779 case MIN_EXPR:
3780 return initial_value;
3782 default:
3783 return NULL_TREE;
3785 else
3786 switch (combined_fn (code))
3788 CASE_CFN_FMIN:
3789 CASE_CFN_FMAX:
3790 return initial_value;
3792 default:
3793 return NULL_TREE;
3797 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3798 STMT is printed with a message MSG. */
3800 static void
3801 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3803 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3806 /* Return true if we need an in-order (fold-left) reduction for operation
3807 CODE on type TYPE, i.e. if the operation cannot safely be reassociated
3808 for that type. */
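   /* A rough illustration: in single-precision float arithmetic
      (1.0e30f + -1.0e30f) + 1.0f is 1.0f, whereas
      1.0e30f + (-1.0e30f + 1.0f) is 0.0f, so a float PLUS_EXPR reduction
      must be evaluated in order unless -fassociative-math is in effect.  */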
3810 bool
3811 needs_fold_left_reduction_p (tree type, code_helper code)
3813 /* CHECKME: check for !flag_finite_math_only too? */
3814 if (SCALAR_FLOAT_TYPE_P (type))
3816 if (code.is_tree_code ())
3817 switch (tree_code (code))
3819 case MIN_EXPR:
3820 case MAX_EXPR:
3821 return false;
3823 default:
3824 return !flag_associative_math;
3826 else
3827 switch (combined_fn (code))
3829 CASE_CFN_FMIN:
3830 CASE_CFN_FMAX:
3831 return false;
3833 default:
3834 return !flag_associative_math;
3838 if (INTEGRAL_TYPE_P (type))
3839 return (!code.is_tree_code ()
3840 || !operation_no_trapping_overflow (type, tree_code (code)));
3842 if (SAT_FIXED_POINT_TYPE_P (type))
3843 return true;
3845 return false;
3848 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3849 has a handled computation expression. Store the main reduction
3850 operation in *CODE. */
3852 static bool
3853 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3854 tree loop_arg, code_helper *code,
3855 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3857 auto_bitmap visited;
3858 tree lookfor = PHI_RESULT (phi);
3859 ssa_op_iter curri;
3860 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3861 while (USE_FROM_PTR (curr) != loop_arg)
3862 curr = op_iter_next_use (&curri);
3863 curri.i = curri.numops;
3866 path.safe_push (std::make_pair (curri, curr));
3867 tree use = USE_FROM_PTR (curr);
3868 if (use == lookfor)
3869 break;
3870 gimple *def = SSA_NAME_DEF_STMT (use);
3871 if (gimple_nop_p (def)
3872 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3874 pop:
3877 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3878 curri = x.first;
3879 curr = x.second;
3881 curr = op_iter_next_use (&curri);
3882 /* Skip already visited or non-SSA operands (from iterating
3883 over PHI args). */
3884 while (curr != NULL_USE_OPERAND_P
3885 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3886 || ! bitmap_set_bit (visited,
3887 SSA_NAME_VERSION
3888 (USE_FROM_PTR (curr)))));
3890 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3891 if (curr == NULL_USE_OPERAND_P)
3892 break;
3894 else
3896 if (gimple_code (def) == GIMPLE_PHI)
3897 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3898 else
3899 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3900 while (curr != NULL_USE_OPERAND_P
3901 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3902 || ! bitmap_set_bit (visited,
3903 SSA_NAME_VERSION
3904 (USE_FROM_PTR (curr)))))
3905 curr = op_iter_next_use (&curri);
3906 if (curr == NULL_USE_OPERAND_P)
3907 goto pop;
3910 while (1);
3911 if (dump_file && (dump_flags & TDF_DETAILS))
3913 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3914 unsigned i;
3915 std::pair<ssa_op_iter, use_operand_p> *x;
3916 FOR_EACH_VEC_ELT (path, i, x)
3917 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3918 dump_printf (MSG_NOTE, "\n");
3921 /* Check whether the reduction path detected is valid. */
3922 bool fail = path.length () == 0;
3923 bool neg = false;
3924 int sign = -1;
3925 *code = ERROR_MARK;
3926 for (unsigned i = 1; i < path.length (); ++i)
3928 gimple *use_stmt = USE_STMT (path[i].second);
3929 gimple_match_op op;
3930 if (!gimple_extract_op (use_stmt, &op))
3932 fail = true;
3933 break;
3935 unsigned int opi = op.num_ops;
3936 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3938 /* The following makes sure we can compute the operand index
3939 easily; it also mostly disallows chaining via COND_EXPR condition
3940 operands. */
3941 for (opi = 0; opi < op.num_ops; ++opi)
3942 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3943 break;
3945 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3947 for (opi = 0; opi < op.num_ops; ++opi)
3948 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3949 break;
3951 if (opi == op.num_ops)
3953 fail = true;
3954 break;
3956 op.code = canonicalize_code (op.code, op.type);
3957 if (op.code == MINUS_EXPR)
3959 op.code = PLUS_EXPR;
3960 /* Track whether we negate the reduction value each iteration. */
3961 if (op.ops[1] == op.ops[opi])
3962 neg = ! neg;
3964 if (CONVERT_EXPR_CODE_P (op.code)
3965 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3967 else if (*code == ERROR_MARK)
3969 *code = op.code;
3970 sign = TYPE_SIGN (op.type);
3972 else if (op.code != *code)
3974 fail = true;
3975 break;
3977 else if ((op.code == MIN_EXPR
3978 || op.code == MAX_EXPR)
3979 && sign != TYPE_SIGN (op.type))
3981 fail = true;
3982 break;
3984 /* Check that the op is used on only a single stmt. For the
3985 non-value-changing tail and the last stmt, allow out-of-loop uses.
3986 ??? We could relax this and handle arbitrary live stmts by
3987 forcing a scalar epilogue for example. */
3988 imm_use_iterator imm_iter;
3989 gimple *op_use_stmt;
3990 unsigned cnt = 0;
3991 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3992 if (!is_gimple_debug (op_use_stmt)
3993 && (*code != ERROR_MARK
3994 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3996 /* We want to allow x + x but not x < 1 ? x : 2. */
3997 if (is_gimple_assign (op_use_stmt)
3998 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
4000 use_operand_p use_p;
4001 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4002 cnt++;
4004 else
4005 cnt++;
4007 if (cnt != 1)
4009 fail = true;
4010 break;
4013 return ! fail && ! neg && *code != ERROR_MARK;
4016 bool
4017 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4018 tree loop_arg, enum tree_code code)
4020 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4021 code_helper code_;
4022 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4023 && code_ == code);
4028 /* Function vect_is_simple_reduction
4030 (1) Detect a cross-iteration def-use cycle that represents a simple
4031 reduction computation. We look for the following pattern:
4033 loop_header:
4034 a1 = phi < a0, a2 >
4035 a3 = ...
4036 a2 = operation (a3, a1)
4040 a3 = ...
4041 loop_header:
4042 a1 = phi < a0, a2 >
4043 a2 = operation (a3, a1)
4045 such that:
4046 1. operation is commutative and associative and it is safe to
4047 change the order of the computation
4048 2. no uses for a2 in the loop (a2 is used out of the loop)
4049 3. no uses of a1 in the loop besides the reduction operation
4050 4. no uses of a1 outside the loop.
4052 Conditions 1,4 are tested here.
4053 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4055 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4056 nested cycles.
4058 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4059 reductions:
4061 a1 = phi < a0, a2 >
4062 inner loop (def of a3)
4063 a2 = phi < a3 >
4065 (4) Detect condition expressions, i.e.:
4066 for (int i = 0; i < N; i++)
4067 if (a[i] < val)
4068 ret_val = a[i];
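   As a concrete sketch of pattern (1) above, a scalar sum reduction like

     for (i = 0; i < N; i++)
       sum += a[i];

   is seen by the vectorizer roughly as

     loop_header:
       sum_1 = phi < sum_0, sum_2 >
       t_3 = a[i];
       sum_2 = t_3 + sum_1;

   with a1 = sum_1, a2 = sum_2 and a3 = t_3 in the notation above.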
4072 static stmt_vec_info
4073 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4074 bool *double_reduc, bool *reduc_chain_p, bool slp)
4076 gphi *phi = as_a <gphi *> (phi_info->stmt);
4077 gimple *phi_use_stmt = NULL;
4078 imm_use_iterator imm_iter;
4079 use_operand_p use_p;
4081 *double_reduc = false;
4082 *reduc_chain_p = false;
4083 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4085 tree phi_name = PHI_RESULT (phi);
4086 /* ??? If there are no uses of the PHI result the inner loop reduction
4087 won't be detected as possibly double-reduction by vectorizable_reduction
4088 because that tries to walk the PHI arg from the preheader edge which
4089 can be constant. See PR60382. */
4090 if (has_zero_uses (phi_name))
4091 return NULL;
4092 class loop *loop = (gimple_bb (phi))->loop_father;
4093 unsigned nphi_def_loop_uses = 0;
4094 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4096 gimple *use_stmt = USE_STMT (use_p);
4097 if (is_gimple_debug (use_stmt))
4098 continue;
4100 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4102 if (dump_enabled_p ())
4103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4104 "intermediate value used outside loop.\n");
4106 return NULL;
4109 nphi_def_loop_uses++;
4110 phi_use_stmt = use_stmt;
4113 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4114 if (TREE_CODE (latch_def) != SSA_NAME)
4116 if (dump_enabled_p ())
4117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4118 "reduction: not ssa_name: %T\n", latch_def);
4119 return NULL;
4122 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4123 if (!def_stmt_info
4124 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4125 return NULL;
4127 bool nested_in_vect_loop
4128 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4129 unsigned nlatch_def_loop_uses = 0;
4130 auto_vec<gphi *, 3> lcphis;
4131 bool inner_loop_of_double_reduc = false;
4132 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4134 gimple *use_stmt = USE_STMT (use_p);
4135 if (is_gimple_debug (use_stmt))
4136 continue;
4137 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4138 nlatch_def_loop_uses++;
4139 else
4141 /* We can have more than one loop-closed PHI. */
4142 lcphis.safe_push (as_a <gphi *> (use_stmt));
4143 if (nested_in_vect_loop
4144 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4145 == vect_double_reduction_def))
4146 inner_loop_of_double_reduc = true;
4150 /* If we are vectorizing an inner reduction we execute it
4151 in the original order only when we are not dealing with a
4152 double reduction. */
4153 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4155 if (dump_enabled_p ())
4156 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4157 "detected nested cycle: ");
4158 return def_stmt_info;
4161 /* When the inner loop of a double reduction ends up with more than
4162 one loop-closed PHI we have failed to classify alternate such
4163 PHIs as double reduction, leading to wrong code. See PR103237. */
4164 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4166 if (dump_enabled_p ())
4167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4168 "unhandle double reduction\n");
4169 return NULL;
4172 /* If this isn't a nested cycle or if the nested cycle reduction value
4173 is used outside of the inner loop we cannot handle uses of the reduction
4174 value. */
4175 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4177 if (dump_enabled_p ())
4178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4179 "reduction used in loop.\n");
4180 return NULL;
4183 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4184 defined in the inner loop. */
4185 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4187 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4188 if (gimple_phi_num_args (def_stmt) != 1
4189 || TREE_CODE (op1) != SSA_NAME)
4191 if (dump_enabled_p ())
4192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4193 "unsupported phi node definition.\n");
4195 return NULL;
4198 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4199 and the latch definition op1. */
4200 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4201 if (gimple_bb (def1)
4202 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4203 && loop->inner
4204 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4205 && (is_gimple_assign (def1) || is_gimple_call (def1))
4206 && is_a <gphi *> (phi_use_stmt)
4207 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4208 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4209 loop_latch_edge (loop->inner))))
4211 if (dump_enabled_p ())
4212 report_vect_op (MSG_NOTE, def_stmt,
4213 "detected double reduction: ");
4215 *double_reduc = true;
4216 return def_stmt_info;
4219 return NULL;
4222 /* Look for the expression computing latch_def from the loop PHI result. */
4223 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4224 code_helper code;
4225 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4226 path))
4228 STMT_VINFO_REDUC_CODE (phi_info) = code;
4229 if (code == COND_EXPR && !nested_in_vect_loop)
4230 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4232 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4233 reduction chain for which the additional restriction is that
4234 all operations in the chain are the same. */
4235 auto_vec<stmt_vec_info, 8> reduc_chain;
4236 unsigned i;
4237 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4238 for (i = path.length () - 1; i >= 1; --i)
4240 gimple *stmt = USE_STMT (path[i].second);
4241 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4242 gimple_match_op op;
4243 if (!gimple_extract_op (stmt, &op))
4244 gcc_unreachable ();
4245 if (gassign *assign = dyn_cast<gassign *> (stmt))
4246 STMT_VINFO_REDUC_IDX (stmt_info)
4247 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4248 else
4250 gcall *call = as_a<gcall *> (stmt);
4251 STMT_VINFO_REDUC_IDX (stmt_info)
4252 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4254 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4255 && (i == 1 || i == path.length () - 1));
4256 if ((op.code != code && !leading_conversion)
4257 /* We can only handle the final value in epilogue
4258 generation for reduction chains. */
4259 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4260 is_slp_reduc = false;
4261 /* For reduction chains we support trailing/leading
4262 conversions. We do not store those in the actual chain. */
4263 if (leading_conversion)
4264 continue;
4265 reduc_chain.safe_push (stmt_info);
4267 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4269 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4271 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4272 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4274 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4275 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4277 /* Save the chain for further analysis in SLP detection. */
4278 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4279 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4281 *reduc_chain_p = true;
4282 if (dump_enabled_p ())
4283 dump_printf_loc (MSG_NOTE, vect_location,
4284 "reduction: detected reduction chain\n");
4286 else if (dump_enabled_p ())
4287 dump_printf_loc (MSG_NOTE, vect_location,
4288 "reduction: detected reduction\n");
4290 return def_stmt_info;
4293 if (dump_enabled_p ())
4294 dump_printf_loc (MSG_NOTE, vect_location,
4295 "reduction: unknown pattern\n");
4297 return NULL;
4300 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4301 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4302 or -1 if not known. */
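   /* A made-up example: with NITERS = 23, PEEL_ITERS_PROLOGUE = 3 and an
      assumed vectorization factor of 8, the epilogue is expected to run
      (23 - 3) % 8 = 4 scalar iterations.  */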
4304 static int
4305 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4307 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4308 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4310 if (dump_enabled_p ())
4311 dump_printf_loc (MSG_NOTE, vect_location,
4312 "cost model: epilogue peel iters set to vf/2 "
4313 "because loop iterations are unknown .\n");
4314 return assumed_vf / 2;
4316 else
4318 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4319 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4320 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4321 /* If we need to peel for gaps, but no epilogue peeling would otherwise
4322 be required, we have to peel VF iterations. */
4323 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4324 peel_iters_epilogue = assumed_vf;
4325 return peel_iters_epilogue;
4329 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4331 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4332 int *peel_iters_epilogue,
4333 stmt_vector_for_cost *scalar_cost_vec,
4334 stmt_vector_for_cost *prologue_cost_vec,
4335 stmt_vector_for_cost *epilogue_cost_vec)
4337 int retval = 0;
4339 *peel_iters_epilogue
4340 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4342 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4344 /* If peeled iterations are known but the number of scalar loop
4345 iterations is unknown, count a taken branch per peeled loop. */
4346 if (peel_iters_prologue > 0)
4347 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4348 vect_prologue);
4349 if (*peel_iters_epilogue > 0)
4350 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4351 vect_epilogue);
4354 stmt_info_for_cost *si;
4355 int j;
4356 if (peel_iters_prologue)
4357 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4358 retval += record_stmt_cost (prologue_cost_vec,
4359 si->count * peel_iters_prologue,
4360 si->kind, si->stmt_info, si->misalign,
4361 vect_prologue);
4362 if (*peel_iters_epilogue)
4363 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4364 retval += record_stmt_cost (epilogue_cost_vec,
4365 si->count * *peel_iters_epilogue,
4366 si->kind, si->stmt_info, si->misalign,
4367 vect_epilogue);
4369 return retval;
4372 /* Function vect_estimate_min_profitable_iters
4374 Return the number of iterations required for the vector version of the
4375 loop to be profitable relative to the cost of the scalar version of the
4376 loop.
4378 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4379 of iterations for vectorization. -1 value means loop vectorization
4380 is not profitable. This returned value may be used for dynamic
4381 profitability check.
4383 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4384 for static check against estimated number of iterations. */
4386 static void
4387 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4388 int *ret_min_profitable_niters,
4389 int *ret_min_profitable_estimate,
4390 unsigned *suggested_unroll_factor)
4392 int min_profitable_iters;
4393 int min_profitable_estimate;
4394 int peel_iters_prologue;
4395 int peel_iters_epilogue;
4396 unsigned vec_inside_cost = 0;
4397 int vec_outside_cost = 0;
4398 unsigned vec_prologue_cost = 0;
4399 unsigned vec_epilogue_cost = 0;
4400 int scalar_single_iter_cost = 0;
4401 int scalar_outside_cost = 0;
4402 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4403 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4404 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4406 /* Cost model disabled. */
4407 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4409 if (dump_enabled_p ())
4410 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4411 *ret_min_profitable_niters = 0;
4412 *ret_min_profitable_estimate = 0;
4413 return;
4416 /* Requires loop versioning tests to handle misalignment. */
4417 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4419 /* FIXME: Make cost depend on complexity of individual check. */
4420 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4421 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4422 if (dump_enabled_p ())
4423 dump_printf (MSG_NOTE,
4424 "cost model: Adding cost of checks for loop "
4425 "versioning to treat misalignment.\n");
4428 /* Requires loop versioning with alias checks. */
4429 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4431 /* FIXME: Make cost depend on complexity of individual check. */
4432 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4433 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4434 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4435 if (len)
4436 /* Count LEN - 1 ANDs and LEN comparisons. */
4437 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4438 scalar_stmt, vect_prologue);
4439 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4440 if (len)
4442 /* Count LEN - 1 ANDs and LEN comparisons. */
4443 unsigned int nstmts = len * 2 - 1;
4444 /* +1 for each bias that needs adding. */
4445 for (unsigned int i = 0; i < len; ++i)
4446 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4447 nstmts += 1;
4448 (void) add_stmt_cost (target_cost_data, nstmts,
4449 scalar_stmt, vect_prologue);
4451 if (dump_enabled_p ())
4452 dump_printf (MSG_NOTE,
4453 "cost model: Adding cost of checks for loop "
4454 "versioning aliasing.\n");
4457 /* Requires loop versioning with niter checks. */
4458 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4460 /* FIXME: Make cost depend on complexity of individual check. */
4461 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4462 NULL, NULL, NULL_TREE, 0, vect_prologue);
4463 if (dump_enabled_p ())
4464 dump_printf (MSG_NOTE,
4465 "cost model: Adding cost of checks for loop "
4466 "versioning niters.\n");
4469 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4470 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4471 vect_prologue);
4473 /* Count statements in scalar loop. Using this as scalar cost for a single
4474 iteration for now.
4476 TODO: Add outer loop support.
4478 TODO: Consider assigning different costs to different scalar
4479 statements. */
4481 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4483 /* Add additional cost for the peeled instructions in prologue and epilogue
4484 loop. (For fully-masked loops there will be no peeling.)
4486 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4487 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4489 TODO: Build an expression that represents peel_iters for prologue and
4490 epilogue to be used in a run-time test. */
4492 bool prologue_need_br_taken_cost = false;
4493 bool prologue_need_br_not_taken_cost = false;
4495 /* Calculate peel_iters_prologue. */
4496 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4497 peel_iters_prologue = 0;
4498 else if (npeel < 0)
4500 peel_iters_prologue = assumed_vf / 2;
4501 if (dump_enabled_p ())
4502 dump_printf (MSG_NOTE, "cost model: "
4503 "prologue peel iters set to vf/2.\n");
4505 /* If peeled iterations are unknown, count a taken branch and a not taken
4506 branch per peeled loop. Even if scalar loop iterations are known,
4507 vector iterations are not known since peeled prologue iterations are
4508 not known. Hence guards remain the same. */
4509 prologue_need_br_taken_cost = true;
4510 prologue_need_br_not_taken_cost = true;
4512 else
4514 peel_iters_prologue = npeel;
4515 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4516 /* If peeled iterations are known but the number of scalar loop
4517 iterations is unknown, count a taken branch per peeled loop. */
4518 prologue_need_br_taken_cost = true;
4521 bool epilogue_need_br_taken_cost = false;
4522 bool epilogue_need_br_not_taken_cost = false;
4524 /* Calculate peel_iters_epilogue. */
4525 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4526 /* We need to peel exactly one iteration for gaps. */
4527 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4528 else if (npeel < 0)
4530 /* If peeling for alignment is unknown, loop bound of main loop
4531 becomes unknown. */
4532 peel_iters_epilogue = assumed_vf / 2;
4533 if (dump_enabled_p ())
4534 dump_printf (MSG_NOTE, "cost model: "
4535 "epilogue peel iters set to vf/2 because "
4536 "peeling for alignment is unknown.\n");
4538 /* See the same reason above in peel_iters_prologue calculation. */
4539 epilogue_need_br_taken_cost = true;
4540 epilogue_need_br_not_taken_cost = true;
4542 else
4544 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4545 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4546 /* If peeled iterations are known but the number of scalar loop
4547 iterations is unknown, count a taken branch per peeled loop. */
4548 epilogue_need_br_taken_cost = true;
4551 stmt_info_for_cost *si;
4552 int j;
4553 /* Add costs associated with peel_iters_prologue. */
4554 if (peel_iters_prologue)
4555 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4557 (void) add_stmt_cost (target_cost_data,
4558 si->count * peel_iters_prologue, si->kind,
4559 si->stmt_info, si->node, si->vectype,
4560 si->misalign, vect_prologue);
4563 /* Add costs associated with peel_iters_epilogue. */
4564 if (peel_iters_epilogue)
4565 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4567 (void) add_stmt_cost (target_cost_data,
4568 si->count * peel_iters_epilogue, si->kind,
4569 si->stmt_info, si->node, si->vectype,
4570 si->misalign, vect_epilogue);
4573 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4575 if (prologue_need_br_taken_cost)
4576 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4577 vect_prologue);
4579 if (prologue_need_br_not_taken_cost)
4580 (void) add_stmt_cost (target_cost_data, 1,
4581 cond_branch_not_taken, vect_prologue);
4583 if (epilogue_need_br_taken_cost)
4584 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4585 vect_epilogue);
4587 if (epilogue_need_br_not_taken_cost)
4588 (void) add_stmt_cost (target_cost_data, 1,
4589 cond_branch_not_taken, vect_epilogue);
4591 /* Take care of special costs for rgroup controls of partial vectors. */
4592 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4593 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4594 == vect_partial_vectors_avx512))
4596 /* Calculate how many masks we need to generate. */
4597 unsigned int num_masks = 0;
4598 bool need_saturation = false;
4599 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4600 if (rgm.type)
4602 unsigned nvectors = rgm.factor;
4603 num_masks += nvectors;
4604 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4605 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4606 need_saturation = true;
4609 /* ??? The target isn't able to identify the costs below as
4610 producing masks so it cannot penalize cases where we'd run
4611 out of mask registers for example. */
4613 /* ??? We are also failing to account for smaller vector masks
4614 we generate by splitting larger masks in vect_get_loop_mask. */
4616 /* In the worst case, we need to generate each mask in the prologue
4617 and in the loop body. We need one splat per group and one
4618 compare per mask.
4620 Sometimes the prologue mask will fold to a constant,
4621 so the actual prologue cost might be smaller. However, it's
4622 simpler and safer to use the worst-case cost; if this ends up
4623 being the tie-breaker between vectorizing or not, then it's
4624 probably better not to vectorize. */
4625 (void) add_stmt_cost (target_cost_data,
4626 num_masks
4627 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4628 vector_stmt, NULL, NULL, NULL_TREE, 0,
4629 vect_prologue);
4630 (void) add_stmt_cost (target_cost_data,
4631 num_masks
4632 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4633 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4635 /* When we need saturation we need it both in the prologue and
4636 the epilogue. */
4637 if (need_saturation)
4639 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4640 NULL, NULL, NULL_TREE, 0, vect_prologue);
4641 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4642 NULL, NULL, NULL_TREE, 0, vect_body);
4645 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4646 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4647 == vect_partial_vectors_while_ult))
4649 /* Calculate how many masks we need to generate. */
4650 unsigned int num_masks = 0;
4651 rgroup_controls *rgm;
4652 unsigned int num_vectors_m1;
4653 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4654 num_vectors_m1, rgm)
4655 if (rgm->type)
4656 num_masks += num_vectors_m1 + 1;
4657 gcc_assert (num_masks > 0);
4659 /* In the worst case, we need to generate each mask in the prologue
4660 and in the loop body. One of the loop body mask instructions
4661 replaces the comparison in the scalar loop, and since we don't
4662 count the scalar comparison against the scalar body, we shouldn't
4663 count that vector instruction against the vector body either.
4665 Sometimes we can use unpacks instead of generating prologue
4666 masks and sometimes the prologue mask will fold to a constant,
4667 so the actual prologue cost might be smaller. However, it's
4668 simpler and safer to use the worst-case cost; if this ends up
4669 being the tie-breaker between vectorizing or not, then it's
4670 probably better not to vectorize. */
4671 (void) add_stmt_cost (target_cost_data, num_masks,
4672 vector_stmt, NULL, NULL, NULL_TREE, 0,
4673 vect_prologue);
4674 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4675 vector_stmt, NULL, NULL, NULL_TREE, 0,
4676 vect_body);
4678 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4680 /* Referring to the functions vect_set_loop_condition_partial_vectors
4681 and vect_set_loop_controls_directly, we need to generate each
4682 length in the prologue and in the loop body if required. Although
4683 there are some possible optimizations, we consider the worst case
4684 here. */
4686 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4687 signed char partial_load_store_bias
4688 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4689 bool need_iterate_p
4690 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4691 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4693 /* Calculate how many statements to be added. */
4694 unsigned int prologue_stmts = 0;
4695 unsigned int body_stmts = 0;
4697 rgroup_controls *rgc;
4698 unsigned int num_vectors_m1;
4699 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4700 if (rgc->type)
4702 /* May need one SHIFT for nitems_total computation. */
4703 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4704 if (nitems != 1 && !niters_known_p)
4705 prologue_stmts += 1;
4707 /* May need one MAX and one MINUS for wrap around. */
4708 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4709 prologue_stmts += 2;
4711 /* Need one MAX and one MINUS for each batch limit except for
4712 the first one. */
4713 prologue_stmts += num_vectors_m1 * 2;
4715 unsigned int num_vectors = num_vectors_m1 + 1;
4717 /* Need to set up lengths in prologue, only one MIN required
4718 for each since start index is zero. */
4719 prologue_stmts += num_vectors;
4721 /* If we have a non-zero partial load bias, we need one PLUS
4722 to adjust the load length. */
4723 if (partial_load_store_bias != 0)
4724 body_stmts += 1;
4726 /* Each may need two MINs and one MINUS to update lengths in body
4727 for next iteration. */
4728 if (need_iterate_p)
4729 body_stmts += 3 * num_vectors;
4732 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4733 scalar_stmt, vect_prologue);
4734 (void) add_stmt_cost (target_cost_data, body_stmts,
4735 scalar_stmt, vect_body);
4738 /* FORNOW: The scalar outside cost is incremented in one of the
4739 following ways:
4741 1. The vectorizer checks for alignment and aliasing and generates
4742 a condition that allows dynamic vectorization. A cost model
4743 check is ANDED with the versioning condition. Hence scalar code
4744 path now has the added cost of the versioning check.
4746 if (cost > th & versioning_check)
4747 jmp to vector code
4749 Hence run-time scalar is incremented by not-taken branch cost.
4751 2. The vectorizer then checks if a prologue is required. If the
4752 cost model check was not done before during versioning, it has to
4753 be done before the prologue check.
4755 if (cost <= th)
4756 prologue = scalar_iters
4757 if (prologue == 0)
4758 jmp to vector code
4759 else
4760 execute prologue
4761 if (prologue == num_iters)
4762 go to exit
4764 Hence the run-time scalar cost is incremented by a taken branch,
4765 plus a not-taken branch, plus a taken branch cost.
4767 3. The vectorizer then checks if an epilogue is required. If the
4768 cost model check was not done before during prologue check, it
4769 has to be done with the epilogue check.
4771 if (prologue == 0)
4772 jmp to vector code
4773 else
4774 execute prologue
4775 if (prologue == num_iters)
4776 go to exit
4777 vector code:
4778 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4779 jmp to epilogue
4781 Hence the run-time scalar cost should be incremented by 2 taken
4782 branches.
4784 TODO: The back end may reorder the BBs differently and reverse
4785 conditions/branch directions. Change the estimates below to
4786 something more reasonable. */
4788 /* If the number of iterations is known and we do not do versioning, we can
4789 decide whether to vectorize at compile time. Hence the scalar version
4790 does not carry cost model guard costs. */
4791 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4792 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4794 /* Cost model check occurs at versioning. */
4795 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4796 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4797 else
4799 /* Cost model check occurs at prologue generation. */
4800 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4801 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4802 + vect_get_stmt_cost (cond_branch_not_taken);
4803 /* Cost model check occurs at epilogue generation. */
4804 else
4805 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4809 /* Complete the target-specific cost calculations. */
4810 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4811 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4812 suggested_unroll_factor);
4814 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4815 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4816 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4817 *suggested_unroll_factor,
4818 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4820 if (dump_enabled_p ())
4821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4822 "can't unroll as unrolled vectorization factor larger"
4823 " than maximum vectorization factor: "
4824 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4825 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4826 *suggested_unroll_factor = 1;
4829 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4831 if (dump_enabled_p ())
4833 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4834 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4835 vec_inside_cost);
4836 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4837 vec_prologue_cost);
4838 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4839 vec_epilogue_cost);
4840 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4841 scalar_single_iter_cost);
4842 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4843 scalar_outside_cost);
4844 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4845 vec_outside_cost);
4846 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4847 peel_iters_prologue);
4848 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4849 peel_iters_epilogue);
4852 /* Calculate number of iterations required to make the vector version
4853 profitable, relative to the loop bodies only. The following condition
4854 must hold true:
4855 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4856 where
4857 SIC = scalar iteration cost, VIC = vector iteration cost,
4858 VOC = vector outside cost, VF = vectorization factor,
4859 NPEEL = prologue iterations + epilogue iterations,
4860 SOC = scalar outside cost for run time cost model check. */
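   /* A made-up numeric sketch: with SIC = 4, VIC = 6, VF = 4, NPEEL = 2,
      VOC = 40 and SOC = 6, and treating the division as exact, the
      condition becomes 4 * niters + 6 > 1.5 * niters + 37, which first
      holds for niters >= 13, i.e. roughly a dozen scalar iterations are
      needed before the vector version starts to win.  */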
4862 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4863 - vec_inside_cost);
4864 if (saving_per_viter <= 0)
4866 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4867 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4868 "vectorization did not happen for a simd loop");
4870 if (dump_enabled_p ())
4871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4872 "cost model: the vector iteration cost = %d "
4873 "divided by the scalar iteration cost = %d "
4874 "is greater or equal to the vectorization factor = %d"
4875 ".\n",
4876 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4877 *ret_min_profitable_niters = -1;
4878 *ret_min_profitable_estimate = -1;
4879 return;
4882 /* ??? The "if" arm is written to handle all cases; see below for what
4883 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4884 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4886 /* Rewriting the condition above in terms of the number of
4887 vector iterations (vniters) rather than the number of
4888 scalar iterations (niters) gives:
4890 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4892 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4894 For integer N, X and Y when X > 0:
4896 N * X > Y <==> N >= (Y /[floor] X) + 1. */
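   /* E.g. with X = 5 and Y = 12: 12 /[floor] 5 + 1 = 3, and indeed
      2 * 5 > 12 is false while 3 * 5 > 12 is true.  */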
4897 int outside_overhead = (vec_outside_cost
4898 - scalar_single_iter_cost * peel_iters_prologue
4899 - scalar_single_iter_cost * peel_iters_epilogue
4900 - scalar_outside_cost);
4901 /* We're only interested in cases that require at least one
4902 vector iteration. */
4903 int min_vec_niters = 1;
4904 if (outside_overhead > 0)
4905 min_vec_niters = outside_overhead / saving_per_viter + 1;
4907 if (dump_enabled_p ())
4908 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4909 min_vec_niters);
4911 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4913 /* Now that we know the minimum number of vector iterations,
4914 find the minimum niters for which the scalar cost is larger:
4916 SIC * niters > VIC * vniters + VOC - SOC
4918 We know that the minimum niters is no more than
4919 vniters * VF + NPEEL, but it might be (and often is) less
4920 than that if a partial vector iteration is cheaper than the
4921 equivalent scalar code. */
4922 int threshold = (vec_inside_cost * min_vec_niters
4923 + vec_outside_cost
4924 - scalar_outside_cost);
4925 if (threshold <= 0)
4926 min_profitable_iters = 1;
4927 else
4928 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4930 else
4931 /* Convert the number of vector iterations into a number of
4932 scalar iterations. */
4933 min_profitable_iters = (min_vec_niters * assumed_vf
4934 + peel_iters_prologue
4935 + peel_iters_epilogue);
4937 else
4939 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4940 * assumed_vf
4941 - vec_inside_cost * peel_iters_prologue
4942 - vec_inside_cost * peel_iters_epilogue);
4943 if (min_profitable_iters <= 0)
4944 min_profitable_iters = 0;
4945 else
4947 min_profitable_iters /= saving_per_viter;
4949 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4950 <= (((int) vec_inside_cost * min_profitable_iters)
4951 + (((int) vec_outside_cost - scalar_outside_cost)
4952 * assumed_vf)))
4953 min_profitable_iters++;
4957 if (dump_enabled_p ())
4958 dump_printf (MSG_NOTE,
4959 " Calculated minimum iters for profitability: %d\n",
4960 min_profitable_iters);
4962 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4963 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4964 /* We want the vectorized loop to execute at least once. */
4965 min_profitable_iters = assumed_vf + peel_iters_prologue;
4966 else if (min_profitable_iters < peel_iters_prologue)
4967 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4968 vectorized loop executes at least once. */
4969 min_profitable_iters = peel_iters_prologue;
4971 if (dump_enabled_p ())
4972 dump_printf_loc (MSG_NOTE, vect_location,
4973 " Runtime profitability threshold = %d\n",
4974 min_profitable_iters);
4976 *ret_min_profitable_niters = min_profitable_iters;
4978 /* Calculate number of iterations required to make the vector version
4979 profitable, relative to the loop bodies only.
4981 The non-vectorized variant costs SIC * niters and it must win over the
4982 vector variant on the expected loop trip count. The following condition must hold true:
4983 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4985 if (vec_outside_cost <= 0)
4986 min_profitable_estimate = 0;
4987 /* ??? This "else if" arm is written to handle all cases; see below for
4988 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4989 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4991 /* This is a repeat of the code above, but with + SOC rather
4992 than - SOC. */
4993 int outside_overhead = (vec_outside_cost
4994 - scalar_single_iter_cost * peel_iters_prologue
4995 - scalar_single_iter_cost * peel_iters_epilogue
4996 + scalar_outside_cost);
4997 int min_vec_niters = 1;
4998 if (outside_overhead > 0)
4999 min_vec_niters = outside_overhead / saving_per_viter + 1;
5001 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5003 int threshold = (vec_inside_cost * min_vec_niters
5004 + vec_outside_cost
5005 + scalar_outside_cost);
5006 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5008 else
5009 min_profitable_estimate = (min_vec_niters * assumed_vf
5010 + peel_iters_prologue
5011 + peel_iters_epilogue);
5013 else
5015 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5016 * assumed_vf
5017 - vec_inside_cost * peel_iters_prologue
5018 - vec_inside_cost * peel_iters_epilogue)
5019 / ((scalar_single_iter_cost * assumed_vf)
5020 - vec_inside_cost);
5022 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5023 if (dump_enabled_p ())
5024 dump_printf_loc (MSG_NOTE, vect_location,
5025 " Static estimate profitability threshold = %d\n",
5026 min_profitable_estimate);
5028 *ret_min_profitable_estimate = min_profitable_estimate;
5031 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5032 vector elements (not bits) for a vector with NELT elements. */
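   /* For instance, with NELT = 8 and OFFSET = 2 the three encoded elements
      are { 2, 3, 4 }; vec_perm_indices extends this step-1 series to
      { 2, 3, ..., 9 }, where, with the two-input use in
      have_whole_vector_shift below, indices 8 and 9 select from the second
      input vector, giving a whole-vector shift down by two elements.  */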
5033 static void
5034 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5035 vec_perm_builder *sel)
5037 /* The encoding is a single stepped pattern. Any wrap-around is handled
5038 by vec_perm_indices. */
5039 sel->new_vector (nelt, 1, 3);
5040 for (unsigned int i = 0; i < 3; i++)
5041 sel->quick_push (i + offset);
5044 /* Checks whether the target supports whole-vector shifts for vectors of mode
5045 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5046 it supports vec_perm_const with masks for all necessary shift amounts. */
5047 static bool
5048 have_whole_vector_shift (machine_mode mode)
5050 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5051 return true;
5053 /* Variable-length vectors should be handled via the optab. */
5054 unsigned int nelt;
5055 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5056 return false;
5058 vec_perm_builder sel;
5059 vec_perm_indices indices;
5060 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5062 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5063 indices.new_vector (sel, 2, nelt);
5064 if (!can_vec_perm_const_p (mode, mode, indices, false))
5065 return false;
5067 return true;
5070 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5071 multiplication operands have differing signs and (b) we intend
5072 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5073 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5075 static bool
5076 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5077 stmt_vec_info stmt_info)
5079 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5080 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5081 return false;
5083 tree rhs1 = gimple_assign_rhs1 (assign);
5084 tree rhs2 = gimple_assign_rhs2 (assign);
5085 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5086 return false;
5088 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5089 gcc_assert (reduc_info->is_reduc_info);
5090 return !directly_supported_p (DOT_PROD_EXPR,
5091 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5092 optab_vector_mixed_sign);
5095 /* TODO: vect_model_*_cost and the vectorizable_* functions are closely
5096 coupled; restructure them to avoid maintenance issues. */
5098 /* Function vect_model_reduction_cost.
5100 Models cost for a reduction operation, including the vector ops
5101 generated within the strip-mine loop in some cases, the initial
5102 definition before the loop, and the epilogue code that must be generated. */
5104 static void
5105 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5106 stmt_vec_info stmt_info, internal_fn reduc_fn,
5107 vect_reduction_type reduction_type,
5108 int ncopies, stmt_vector_for_cost *cost_vec)
5110 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5111 tree vectype;
5112 machine_mode mode;
5113 class loop *loop = NULL;
5115 if (loop_vinfo)
5116 loop = LOOP_VINFO_LOOP (loop_vinfo);
5118 /* Condition reductions generate two reductions in the loop. */
5119 if (reduction_type == COND_REDUCTION)
5120 ncopies *= 2;
5122 vectype = STMT_VINFO_VECTYPE (stmt_info);
5123 mode = TYPE_MODE (vectype);
5124 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5126 gimple_match_op op;
5127 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5128 gcc_unreachable ();
5130 bool emulated_mixed_dot_prod
5131 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5132 if (reduction_type == EXTRACT_LAST_REDUCTION)
5133 /* No extra instructions are needed in the prologue. The loop body
5134 operations are costed in vectorizable_condition. */
5135 inside_cost = 0;
5136 else if (reduction_type == FOLD_LEFT_REDUCTION)
5138 /* No extra instructions needed in the prologue. */
5139 prologue_cost = 0;
5141 if (reduc_fn != IFN_LAST)
5142 /* Count one reduction-like operation per vector. */
5143 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5144 stmt_info, 0, vect_body);
5145 else
5147 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5148 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5149 inside_cost = record_stmt_cost (cost_vec, nelements,
5150 vec_to_scalar, stmt_info, 0,
5151 vect_body);
5152 inside_cost += record_stmt_cost (cost_vec, nelements,
5153 scalar_stmt, stmt_info, 0,
5154 vect_body);
5157 else
5159 /* Add in the cost of the initial definitions. */
5160 int prologue_stmts;
5161 if (reduction_type == COND_REDUCTION)
5162 /* For cond reductions we have four vectors: initial index, step,
5163 initial result of the data reduction, initial value of the index
5164 reduction. */
5165 prologue_stmts = 4;
5166 else if (emulated_mixed_dot_prod)
5167 /* We need the initial reduction value and two invariants:
5168 one that contains the minimum signed value and one that
5169 contains half of its negative. */
5170 prologue_stmts = 3;
5171 else
5172 prologue_stmts = 1;
5173 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5174 scalar_to_vec, stmt_info, 0,
5175 vect_prologue);
5178 /* Determine cost of epilogue code.
5180 We have a reduction operator that will reduce the vector in one statement.
5181 Also requires scalar extract. */
5183 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5185 if (reduc_fn != IFN_LAST)
5187 if (reduction_type == COND_REDUCTION)
5189 /* An EQ stmt and a COND_EXPR stmt. */
5190 epilogue_cost += record_stmt_cost (cost_vec, 2,
5191 vector_stmt, stmt_info, 0,
5192 vect_epilogue);
5193 /* Reduction of the max index and a reduction of the found
5194 values. */
5195 epilogue_cost += record_stmt_cost (cost_vec, 2,
5196 vec_to_scalar, stmt_info, 0,
5197 vect_epilogue);
5198 /* A broadcast of the max value. */
5199 epilogue_cost += record_stmt_cost (cost_vec, 1,
5200 scalar_to_vec, stmt_info, 0,
5201 vect_epilogue);
5203 else
5205 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5206 stmt_info, 0, vect_epilogue);
5207 epilogue_cost += record_stmt_cost (cost_vec, 1,
5208 vec_to_scalar, stmt_info, 0,
5209 vect_epilogue);
5212 else if (reduction_type == COND_REDUCTION)
5214 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5215 /* Extraction of scalar elements. */
5216 epilogue_cost += record_stmt_cost (cost_vec,
5217 2 * estimated_nunits,
5218 vec_to_scalar, stmt_info, 0,
5219 vect_epilogue);
5220 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5221 epilogue_cost += record_stmt_cost (cost_vec,
5222 2 * estimated_nunits - 3,
5223 scalar_stmt, stmt_info, 0,
5224 vect_epilogue);
5226 else if (reduction_type == EXTRACT_LAST_REDUCTION
5227 || reduction_type == FOLD_LEFT_REDUCTION)
5228 /* No extra instructions are needed in the epilogue. */
5230 else
5232 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5233 tree bitsize = TYPE_SIZE (op.type);
5234 int element_bitsize = tree_to_uhwi (bitsize);
5235 int nelements = vec_size_in_bits / element_bitsize;
5237 if (op.code == COND_EXPR)
5238 op.code = MAX_EXPR;
5240 /* We have a whole vector shift available. */
5241 if (VECTOR_MODE_P (mode)
5242 && directly_supported_p (op.code, vectype)
5243 && have_whole_vector_shift (mode))
5245 /* Final reduction via vector shifts and the reduction operator.
5246 Also requires scalar extract. */
5247 epilogue_cost += record_stmt_cost (cost_vec,
5248 exact_log2 (nelements) * 2,
5249 vector_stmt, stmt_info, 0,
5250 vect_epilogue);
5251 epilogue_cost += record_stmt_cost (cost_vec, 1,
5252 vec_to_scalar, stmt_info, 0,
5253 vect_epilogue);
5255 else
5256 /* Use extracts and reduction op for final reduction. For N
5257 elements, we have N extracts and N-1 reduction ops. */
5258 epilogue_cost += record_stmt_cost (cost_vec,
5259 nelements + nelements - 1,
5260 vector_stmt, stmt_info, 0,
5261 vect_epilogue);
5265 if (dump_enabled_p ())
5266 dump_printf (MSG_NOTE,
5267 "vect_model_reduction_cost: inside_cost = %d, "
5268 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5269 prologue_cost, epilogue_cost);
5272 /* SEQ is a sequence of instructions that initialize the reduction
5273 described by REDUC_INFO. Emit them in the appropriate place. */
5275 static void
5276 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5277 stmt_vec_info reduc_info, gimple *seq)
5279 if (reduc_info->reused_accumulator)
5281 /* When reusing an accumulator from the main loop, we only need
5282 initialization instructions if the main loop can be skipped.
5283 In that case, emit the initialization instructions at the end
5284 of the guard block that does the skip. */
5285 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5286 gcc_assert (skip_edge);
5287 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5288 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5290 else
5292 /* The normal case: emit the initialization instructions on the
5293 preheader edge. */
5294 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5295 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5299 /* Function get_initial_def_for_reduction
5301 Input:
5302 REDUC_INFO - the info_for_reduction
5303 INIT_VAL - the initial value of the reduction variable
5304 NEUTRAL_OP - a value that has no effect on the reduction, as per
5305 neutral_op_for_reduction
5307 Output:
5308 Return a vector variable, initialized according to the reduction
5309 operation that REDUC_INFO describes. This vector will be used as the initial value
5310 of the vector of partial results.
5312 The value we need is a vector in which element 0 has value INIT_VAL
5313 and every other element has value NEUTRAL_OP. */
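   /* For example, for a V4SI PLUS reduction with INIT_VAL 10 and NEUTRAL_OP 0
      the result is {10, 0, 0, 0}, so the scalar initial value is counted
      exactly once when the partial results are summed.  When INIT_VAL and
      NEUTRAL_OP are equal (as for MIN/MAX reductions, where the initial value
      itself is passed as the neutral value) the vector degenerates to a
      splat, which is the first case handled below.  */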
5315 static tree
5316 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5317 stmt_vec_info reduc_info,
5318 tree init_val, tree neutral_op)
5320 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5321 tree scalar_type = TREE_TYPE (init_val);
5322 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5323 tree init_def;
5324 gimple_seq stmts = NULL;
5326 gcc_assert (vectype);
5328 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5329 || SCALAR_FLOAT_TYPE_P (scalar_type));
5331 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5332 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5334 if (operand_equal_p (init_val, neutral_op))
5336 /* If both elements are equal then the vector described above is
5337 just a splat. */
5338 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5339 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5341 else
5343 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5344 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5345 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5347 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5348 element 0. */
5349 init_def = gimple_build_vector_from_val (&stmts, vectype,
5350 neutral_op);
5351 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5352 vectype, init_def, init_val);
5354 else
5356 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5357 tree_vector_builder elts (vectype, 1, 2);
5358 elts.quick_push (init_val);
5359 elts.quick_push (neutral_op);
5360 init_def = gimple_build_vector (&stmts, &elts);
5364 if (stmts)
5365 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5366 return init_def;
5369 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5370 which performs a reduction involving GROUP_SIZE scalar statements.
5371 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5372 is nonnull, introducing extra elements of that value will not change the
5373 result. */
5375 static void
5376 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5377 stmt_vec_info reduc_info,
5378 vec<tree> *vec_oprnds,
5379 unsigned int number_of_vectors,
5380 unsigned int group_size, tree neutral_op)
5382 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5383 unsigned HOST_WIDE_INT nunits;
5384 unsigned j, number_of_places_left_in_vector;
5385 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5386 unsigned int i;
5388 gcc_assert (group_size == initial_values.length () || neutral_op);
5390 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5391 created vectors. It is greater than 1 if unrolling is performed.
5393 For example, we have two scalar operands, s1 and s2 (e.g., group of
5394 strided accesses of size two), while NUNITS is four (i.e., four scalars
5395 of this type can be packed in a vector). The output vector will contain
5396 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5397 will be 2).
5399 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5400 vectors containing the operands.
5402 For example, NUNITS is four as before, and the group size is 8
5403 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5404 {s5, s6, s7, s8}. */
5406 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5407 nunits = group_size;
5409 number_of_places_left_in_vector = nunits;
5410 bool constant_p = true;
5411 tree_vector_builder elts (vector_type, nunits, 1);
5412 elts.quick_grow (nunits);
5413 gimple_seq ctor_seq = NULL;
5414 for (j = 0; j < nunits * number_of_vectors; ++j)
5416 tree op;
5417 i = j % group_size;
5419 /* Get the def before the loop. In a reduction chain we have only
5420 one initial value; otherwise we have as many as there are PHIs in the group. */
5421 if (i >= initial_values.length () || (j > i && neutral_op))
5422 op = neutral_op;
5423 else
5424 op = initial_values[i];
5426 /* Create 'vect_ = {op0,op1,...,opn}'. */
5427 number_of_places_left_in_vector--;
5428 elts[nunits - number_of_places_left_in_vector - 1] = op;
5429 if (!CONSTANT_CLASS_P (op))
5430 constant_p = false;
5432 if (number_of_places_left_in_vector == 0)
5434 tree init;
5435 if (constant_p && !neutral_op
5436 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5437 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5438 /* Build the vector directly from ELTS. */
5439 init = gimple_build_vector (&ctor_seq, &elts);
5440 else if (neutral_op)
5442 /* Build a vector of the neutral value and shift the
5443 other elements into place. */
5444 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5445 neutral_op);
5446 int k = nunits;
5447 while (k > 0 && elts[k - 1] == neutral_op)
5448 k -= 1;
5449 while (k > 0)
5451 k -= 1;
5452 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5453 vector_type, init, elts[k]);
5456 else
5458 /* First time round, duplicate ELTS to fill the
5459 required number of vectors. */
5460 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5461 elts, number_of_vectors, *vec_oprnds);
5462 break;
5464 vec_oprnds->quick_push (init);
5466 number_of_places_left_in_vector = nunits;
5467 elts.new_vector (vector_type, nunits, 1);
5468 elts.quick_grow (nunits);
5469 constant_p = true;
5472 if (ctor_seq != NULL)
5473 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5476 /* For a statement STMT_INFO taking part in a reduction operation, return
5477 the stmt_vec_info that the meta information is stored on. */
5479 stmt_vec_info
5480 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5482 stmt_info = vect_orig_stmt (stmt_info);
5483 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5484 if (!is_a <gphi *> (stmt_info->stmt)
5485 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5486 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5487 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5488 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5490 if (gimple_phi_num_args (phi) == 1)
5491 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5493 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5495 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5496 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5497 stmt_info = info;
5499 return stmt_info;
5502 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5503 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5504 return false. */
5506 static bool
5507 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5508 stmt_vec_info reduc_info)
5510 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5511 if (!main_loop_vinfo)
5512 return false;
5514 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5515 return false;
5517 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5518 auto_vec<tree, 16> main_loop_results (num_phis);
5519 auto_vec<tree, 16> initial_values (num_phis);
5520 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5522 /* The epilogue loop can be entered either from the main loop or
5523 from an earlier guard block. */
5524 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5525 for (tree incoming_value : reduc_info->reduc_initial_values)
5527 /* Look for:
5529 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5530 INITIAL_VALUE(guard block)>. */
5531 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5533 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5534 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5536 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5537 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5539 main_loop_results.quick_push (from_main_loop);
5540 initial_values.quick_push (from_skip);
5543 else
5544 /* The main loop dominates the epilogue loop. */
5545 main_loop_results.splice (reduc_info->reduc_initial_values);
5547 /* See if the main loop has the kind of accumulator we need. */
5548 vect_reusable_accumulator *accumulator
5549 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5550 if (!accumulator
5551 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5552 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5553 accumulator->reduc_info->reduc_scalar_results.begin ()))
5554 return false;
5556 /* Handle the case where we can reduce wider vectors to narrower ones. */
5557 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5558 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5559 unsigned HOST_WIDE_INT m;
5560 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5561 TYPE_VECTOR_SUBPARTS (vectype), &m))
5562 return false;
5563 /* Check the intermediate vector types and operations are available. */
5564 tree prev_vectype = old_vectype;
5565 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5566 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5568 intermediate_nunits = exact_div (intermediate_nunits, 2);
5569 tree intermediate_vectype = get_related_vectype_for_scalar_type
5570 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5571 if (!intermediate_vectype
5572 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5573 intermediate_vectype)
5574 || !can_vec_extract (TYPE_MODE (prev_vectype),
5575 TYPE_MODE (intermediate_vectype)))
5576 return false;
5577 prev_vectype = intermediate_vectype;
5580 /* Non-SLP reductions might apply an adjustment after the reduction
5581 operation, in order to simplify the initialization of the accumulator.
5582 If the epilogue loop carries on from where the main loop left off,
5583 it should apply the same adjustment to the final reduction result.
5585 If the epilogue loop can also be entered directly (rather than via
5586 the main loop), we need to be able to handle that case in the same way,
5587 with the same adjustment. (In principle we could add a PHI node
5588 to select the correct adjustment, but in practice that shouldn't be
5589 necessary.) */
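   /* As an illustration: for "s = 10; for (...) s += a[i];" the main loop may
      start its accumulator at {0, ...} and record 10 as the epilogue
      adjustment.  If this epilogue loop can also be entered directly, its
      incoming initial value is that same 10, so below we replace it with the
      neutral value 0 and keep applying the same adjustment afterwards.  */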
5590 tree main_adjustment
5591 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5592 if (loop_vinfo->main_loop_edge && main_adjustment)
5594 gcc_assert (num_phis == 1);
5595 tree initial_value = initial_values[0];
5596 /* Check that we can use INITIAL_VALUE as the adjustment and
5597 initialize the accumulator with a neutral value instead. */
5598 if (!operand_equal_p (initial_value, main_adjustment))
5599 return false;
5600 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5601 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5602 code, initial_value);
5604 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5605 reduc_info->reduc_initial_values.truncate (0);
5606 reduc_info->reduc_initial_values.splice (initial_values);
5607 reduc_info->reused_accumulator = accumulator;
5608 return true;
5611 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5612 CODE, appending the generated stmts to SEQ. Returns a vector def of VECTYPE. */
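   /* For example, reducing a V8SI accumulator to V4SI takes one halving step:
      the low and high V4SI halves are extracted (directly via vec_extract, or
      by punning through a two-element integer vector when that is all the
      target provides) and combined with CODE.  Larger ratios simply repeat
      the halving in the loop below.  */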
5614 static tree
5615 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5616 gimple_seq *seq)
5618 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5619 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5620 tree stype = TREE_TYPE (vectype);
5621 tree new_temp = vec_def;
5622 while (nunits > nunits1)
5624 nunits /= 2;
5625 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5626 stype, nunits);
5627 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5629 /* The target has to make sure we support lowpart/highpart
5630 extraction, either via direct vector extract or through
5631 an integer mode punning. */
5632 tree dst1, dst2;
5633 gimple *epilog_stmt;
5634 if (convert_optab_handler (vec_extract_optab,
5635 TYPE_MODE (TREE_TYPE (new_temp)),
5636 TYPE_MODE (vectype1))
5637 != CODE_FOR_nothing)
5639 /* Extract sub-vectors directly once vec_extract becomes
5640 a conversion optab. */
5641 dst1 = make_ssa_name (vectype1);
5642 epilog_stmt
5643 = gimple_build_assign (dst1, BIT_FIELD_REF,
5644 build3 (BIT_FIELD_REF, vectype1,
5645 new_temp, TYPE_SIZE (vectype1),
5646 bitsize_int (0)));
5647 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5648 dst2 = make_ssa_name (vectype1);
5649 epilog_stmt
5650 = gimple_build_assign (dst2, BIT_FIELD_REF,
5651 build3 (BIT_FIELD_REF, vectype1,
5652 new_temp, TYPE_SIZE (vectype1),
5653 bitsize_int (bitsize)));
5654 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5656 else
5658 /* Extract via punning to appropriately sized integer mode
5659 vector. */
5660 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5661 tree etype = build_vector_type (eltype, 2);
5662 gcc_assert (convert_optab_handler (vec_extract_optab,
5663 TYPE_MODE (etype),
5664 TYPE_MODE (eltype))
5665 != CODE_FOR_nothing);
5666 tree tem = make_ssa_name (etype);
5667 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5668 build1 (VIEW_CONVERT_EXPR,
5669 etype, new_temp));
5670 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5671 new_temp = tem;
5672 tem = make_ssa_name (eltype);
5673 epilog_stmt
5674 = gimple_build_assign (tem, BIT_FIELD_REF,
5675 build3 (BIT_FIELD_REF, eltype,
5676 new_temp, TYPE_SIZE (eltype),
5677 bitsize_int (0)));
5678 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5679 dst1 = make_ssa_name (vectype1);
5680 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5681 build1 (VIEW_CONVERT_EXPR,
5682 vectype1, tem));
5683 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5684 tem = make_ssa_name (eltype);
5685 epilog_stmt
5686 = gimple_build_assign (tem, BIT_FIELD_REF,
5687 build3 (BIT_FIELD_REF, eltype,
5688 new_temp, TYPE_SIZE (eltype),
5689 bitsize_int (bitsize)));
5690 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5691 dst2 = make_ssa_name (vectype1);
5692 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5693 build1 (VIEW_CONVERT_EXPR,
5694 vectype1, tem));
5695 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5698 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5701 return new_temp;
5704 /* Function vect_create_epilog_for_reduction
5706 Create code at the loop-epilog to finalize the result of a reduction
5707 computation.
5709 STMT_INFO is the scalar reduction stmt that is being vectorized.
5710 SLP_NODE is an SLP node containing a group of reduction statements. The
5711 first one in this group is STMT_INFO.
5712 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5713 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5714 (counting from 0)
5716 This function:
5717 1. Completes the reduction def-use cycles.
5718 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5719 by calling the function specified by REDUC_FN if available, or by
5720 other means (whole-vector shifts or a scalar loop).
5721 The function also creates a new phi node at the loop exit to preserve
5722 loop-closed form, as illustrated below.
5724 The flow at the entry to this function:
5726 loop:
5727 vec_def = phi <vec_init, null> # REDUCTION_PHI
5728 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5729 s_loop = scalar_stmt # (scalar) STMT_INFO
5730 loop_exit:
5731 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5732 use <s_out0>
5733 use <s_out0>
5735 The above is transformed by this function into:
5737 loop:
5738 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5739 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5740 s_loop = scalar_stmt # (scalar) STMT_INFO
5741 loop_exit:
5742 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5743 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5744 v_out2 = reduce <v_out1>
5745 s_out3 = extract_field <v_out2, 0>
5746 s_out4 = adjust_result <s_out3>
5747 use <s_out4>
5748 use <s_out4>
5751 static void
5752 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5753 stmt_vec_info stmt_info,
5754 slp_tree slp_node,
5755 slp_instance slp_node_instance)
5757 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5758 gcc_assert (reduc_info->is_reduc_info);
5759 /* For double reductions we need to get at the inner loop reduction
5760 stmt which has the meta info attached. Our stmt_info is that of the
5761 loop-closed PHI of the inner loop which we remember as
5762 def for the reduction PHI generation. */
5763 bool double_reduc = false;
5764 stmt_vec_info rdef_info = stmt_info;
5765 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5767 gcc_assert (!slp_node);
5768 double_reduc = true;
5769 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5770 (stmt_info->stmt, 0));
5771 stmt_info = vect_stmt_to_vectorize (stmt_info);
5773 gphi *reduc_def_stmt
5774 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5775 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5776 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5777 tree vectype;
5778 machine_mode mode;
5779 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5780 basic_block exit_bb;
5781 tree scalar_dest;
5782 tree scalar_type;
5783 gimple *new_phi = NULL, *phi;
5784 gimple_stmt_iterator exit_gsi;
5785 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5786 gimple *epilog_stmt = NULL;
5787 gimple *exit_phi;
5788 tree bitsize;
5789 tree def;
5790 tree orig_name, scalar_result;
5791 imm_use_iterator imm_iter, phi_imm_iter;
5792 use_operand_p use_p, phi_use_p;
5793 gimple *use_stmt;
5794 auto_vec<tree> reduc_inputs;
5795 int j, i;
5796 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5797 unsigned int group_size = 1, k;
5798 auto_vec<gimple *> phis;
5799 /* SLP reduction without reduction chain, e.g.,
5800 # a1 = phi <a2, a0>
5801 # b1 = phi <b2, b0>
5802 a2 = operation (a1)
5803 b2 = operation (b1) */
5804 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5805 bool direct_slp_reduc;
5806 tree induction_index = NULL_TREE;
5808 if (slp_node)
5809 group_size = SLP_TREE_LANES (slp_node);
5811 if (nested_in_vect_loop_p (loop, stmt_info))
5813 outer_loop = loop;
5814 loop = loop->inner;
5815 gcc_assert (!slp_node && double_reduc);
5818 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5819 gcc_assert (vectype);
5820 mode = TYPE_MODE (vectype);
5822 tree induc_val = NULL_TREE;
5823 tree adjustment_def = NULL;
5824 if (slp_node)
5826 else
5828 /* Optimize: for induction condition reduction, if we can't use zero
5829 for induc_val, use initial_def. */
5830 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5831 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5832 else if (double_reduc)
5834 else
5835 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5838 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5839 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5840 if (slp_reduc)
5841 /* All statements produce live-out values. */
5842 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5843 else if (slp_node)
5845 /* The last statement in the reduction chain produces the live-out
5846 value. Note SLP optimization can shuffle scalar stmts to
5847 optimize permutations so we have to search for the last stmt. */
5848 for (k = 0; k < group_size; ++k)
5849 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5851 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5852 break;
5856 unsigned vec_num;
5857 int ncopies;
5858 if (slp_node)
5860 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5861 ncopies = 1;
5863 else
5865 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5866 vec_num = 1;
5867 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5870 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5871 which is updated with the current index of the loop for every match of
5872 the original loop's cond_expr (VEC_STMT). This results in a vector
5873 containing the last time the condition passed for that vector lane.
5874 The first match will be a 1 to allow 0 to be used for non-matching
5875 indexes. If there are no matches at all then the vector will be all
5876 zeroes.
5878 PR92772: This algorithm is broken for architectures that support
5879 masked vectors, but do not provide fold_extract_last. */
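  /* As an illustration, with VF = 4 the index IV has the value {1, 2, 3, 4}
     in the first vector iteration and {5, 6, 7, 8} in the second.  If lane 1
     matches only in the first iteration and lane 3 matches in both, the
     resulting INDUCTION_INDEX is {0, 2, 0, 8}: the IV value of the last match
     in each lane, or 0 for lanes that never matched.  */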
5880 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5882 auto_vec<std::pair<tree, bool>, 2> ccompares;
5883 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5884 cond_info = vect_stmt_to_vectorize (cond_info);
5885 while (cond_info != reduc_info)
5887 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5889 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5890 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5891 ccompares.safe_push
5892 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5893 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5895 cond_info
5896 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5897 1 + STMT_VINFO_REDUC_IDX
5898 (cond_info)));
5899 cond_info = vect_stmt_to_vectorize (cond_info);
5901 gcc_assert (ccompares.length () != 0);
5903 tree indx_before_incr, indx_after_incr;
5904 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5905 int scalar_precision
5906 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5907 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5908 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5909 (TYPE_MODE (vectype), cr_index_scalar_type,
5910 TYPE_VECTOR_SUBPARTS (vectype));
5912 /* First we create a simple vector induction variable which starts
5913 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5914 vector size (STEP). */
5916 /* Create a {1,2,3,...} vector. */
5917 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5919 /* Create a vector of the step value. */
5920 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5921 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5923 /* Create an induction variable. */
5924 gimple_stmt_iterator incr_gsi;
5925 bool insert_after;
5926 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5927 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5928 insert_after, &indx_before_incr, &indx_after_incr);
5930 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5931 filled with zeros (VEC_ZERO). */
5933 /* Create a vector of 0s. */
5934 tree zero = build_zero_cst (cr_index_scalar_type);
5935 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5937 /* Create a vector phi node. */
5938 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5939 new_phi = create_phi_node (new_phi_tree, loop->header);
5940 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5941 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5943 /* Now take the condition from the loop's original cond_exprs
5944 and produce a new cond_expr (INDEX_COND_EXPR) which for
5945 every match uses values from the induction variable
5946 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5947 (NEW_PHI_TREE).
5948 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5949 the new cond_expr (INDEX_COND_EXPR). */
5950 gimple_seq stmts = NULL;
5951 for (int i = ccompares.length () - 1; i != -1; --i)
5953 tree ccompare = ccompares[i].first;
5954 if (ccompares[i].second)
5955 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5956 cr_index_vector_type,
5957 ccompare,
5958 indx_before_incr, new_phi_tree);
5959 else
5960 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5961 cr_index_vector_type,
5962 ccompare,
5963 new_phi_tree, indx_before_incr);
5965 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5967 /* Update the phi with the vec cond. */
5968 induction_index = new_phi_tree;
5969 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5970 loop_latch_edge (loop), UNKNOWN_LOCATION);
5973 /* 2. Create epilog code.
5974 The reduction epilog code operates across the elements of the vector
5975 of partial results computed by the vectorized loop.
5976 The reduction epilog code consists of:
5978 step 1: compute the scalar result in a vector (v_out2)
5979 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5980 step 3: adjust the scalar result (s_out3) if needed.
5982 Step 1 can be accomplished using one of the following three schemes:
5983 (scheme 1) using reduc_fn, if available.
5984 (scheme 2) using whole-vector shifts, if available.
5985 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5986 combined.
5988 The overall epilog code looks like this:
5990 s_out0 = phi <s_loop> # original EXIT_PHI
5991 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5992 v_out2 = reduce <v_out1> # step 1
5993 s_out3 = extract_field <v_out2, 0> # step 2
5994 s_out4 = adjust_result <s_out3> # step 3
5996 (step 3 is optional, and steps 1 and 2 may be combined).
5997 Lastly, the uses of s_out0 are replaced by s_out4. */
6000 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6001 v_out1 = phi <VECT_DEF>
6002 Store them in NEW_PHIS. */
6003 if (double_reduc)
6004 loop = outer_loop;
6005 exit_bb = single_exit (loop)->dest;
6006 exit_gsi = gsi_after_labels (exit_bb);
6007 reduc_inputs.create (slp_node ? vec_num : ncopies);
6008 for (unsigned i = 0; i < vec_num; i++)
6010 gimple_seq stmts = NULL;
6011 if (slp_node)
6012 def = vect_get_slp_vect_def (slp_node, i);
6013 else
6014 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6015 for (j = 0; j < ncopies; j++)
6017 tree new_def = copy_ssa_name (def);
6018 phi = create_phi_node (new_def, exit_bb);
6019 if (j)
6020 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6021 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
6022 new_def = gimple_convert (&stmts, vectype, new_def);
6023 reduc_inputs.quick_push (new_def);
6025 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6028 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6029 (i.e. when reduc_fn is not available) and in the final adjustment
6030 code (if needed). Also get the original scalar reduction variable as
6031 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6032 represents a reduction pattern), the tree-code and scalar-def are
6033 taken from the original stmt that the pattern-stmt (STMT) replaces.
6034 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6035 are taken from STMT. */
6037 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6038 if (orig_stmt_info != stmt_info)
6040 /* Reduction pattern */
6041 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6042 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6045 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6046 scalar_type = TREE_TYPE (scalar_dest);
6047 scalar_results.truncate (0);
6048 scalar_results.reserve_exact (group_size);
6049 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6050 bitsize = TYPE_SIZE (scalar_type);
6052 /* True if we should implement SLP_REDUC using native reduction operations
6053 instead of scalar operations. */
6054 direct_slp_reduc = (reduc_fn != IFN_LAST
6055 && slp_reduc
6056 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6058 /* In case of reduction chain, e.g.,
6059 # a1 = phi <a3, a0>
6060 a2 = operation (a1)
6061 a3 = operation (a2),
6063 we may end up with more than one vector result. Here we reduce them
6064 to one vector.
6066 The same is true for an SLP reduction, e.g.,
6067 # a1 = phi <a2, a0>
6068 # b1 = phi <b2, b0>
6069 a2 = operation (a1)
6070 b2 = operation (b1),
6072 where we can end up with more than one vector as well. We can
6073 easily accumulate vectors when the number of vector elements is
6074 a multiple of the SLP group size.
6076 The same is true if we couldn't use a single defuse cycle. */
6077 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6078 || direct_slp_reduc
6079 || (slp_reduc
6080 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6081 || ncopies > 1)
6083 gimple_seq stmts = NULL;
6084 tree single_input = reduc_inputs[0];
6085 for (k = 1; k < reduc_inputs.length (); k++)
6086 single_input = gimple_build (&stmts, code, vectype,
6087 single_input, reduc_inputs[k]);
6088 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6090 reduc_inputs.truncate (0);
6091 reduc_inputs.safe_push (single_input);
6094 tree orig_reduc_input = reduc_inputs[0];
6096 /* If this loop is an epilogue loop that can be skipped after the
6097 main loop, we can only share a reduction operation between the
6098 main loop and the epilogue if we put it at the target of the
6099 skip edge.
6101 We can still reuse accumulators if this check fails. Doing so has
6102 the minor(?) benefit of making the epilogue loop's scalar result
6103 independent of the main loop's scalar result. */
6104 bool unify_with_main_loop_p = false;
6105 if (reduc_info->reused_accumulator
6106 && loop_vinfo->skip_this_loop_edge
6107 && single_succ_p (exit_bb)
6108 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6110 unify_with_main_loop_p = true;
6112 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6113 reduc_inputs[0] = make_ssa_name (vectype);
6114 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6115 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6116 UNKNOWN_LOCATION);
6117 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6118 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6119 exit_gsi = gsi_after_labels (reduc_block);
6122 /* Shouldn't be used beyond this point. */
6123 exit_bb = nullptr;
6125 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6126 && reduc_fn != IFN_LAST)
6128 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6129 various data values where the condition matched and another vector
6130 (INDUCTION_INDEX) containing all the indexes of those matches. We
6131 need to extract the last matching index (which will be the index with
6132 highest value) and use this to index into the data vector.
6133 For the case where there were no matches, the data vector will contain
6134 all default values and the index vector will be all zeros. */
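      /* Continuing the {0, 2, 0, 8} illustration above: REDUC_MAX over the
	 index vector gives 8, comparing against {8, 8, 8, 8} selects only
	 lane 3, the VEC_COND keeps that lane's data value and zeros the rest,
	 and a final REDUC_MAX on the values (viewed as unsigned) extracts it
	 as the scalar result.  */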
6136 /* Get various versions of the type of the vector of indexes. */
6137 tree index_vec_type = TREE_TYPE (induction_index);
6138 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6139 tree index_scalar_type = TREE_TYPE (index_vec_type);
6140 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6142 /* Get an unsigned integer version of the type of the data vector. */
6143 int scalar_precision
6144 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6145 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6146 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6147 vectype);
6149 /* First we need to create a vector (ZERO_VEC) of zeros and another
6150 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6151 can create using a MAX reduction and then expanding.
6152 In the case where the loop never made any matches, the max index will
6153 be zero. */
6155 /* Vector of {0, 0, 0,...}. */
6156 tree zero_vec = build_zero_cst (vectype);
6158 /* Find maximum value from the vector of found indexes. */
6159 tree max_index = make_ssa_name (index_scalar_type);
6160 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6161 1, induction_index);
6162 gimple_call_set_lhs (max_index_stmt, max_index);
6163 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6165 /* Vector of {max_index, max_index, max_index,...}. */
6166 tree max_index_vec = make_ssa_name (index_vec_type);
6167 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6168 max_index);
6169 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6170 max_index_vec_rhs);
6171 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6173 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6174 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6175 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6176 otherwise. Only one value should match, resulting in a vector
6177 (VEC_COND) with one data value and the rest zeros.
6178 In the case where the loop never made any matches, every index will
6179 match, resulting in a vector with all data values (which will all be
6180 the default value). */
6182 /* Compare the max index vector to the vector of found indexes to find
6183 the position of the max value. */
6184 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6185 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6186 induction_index,
6187 max_index_vec);
6188 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6190 /* Use the compare to choose either values from the data vector or
6191 zero. */
6192 tree vec_cond = make_ssa_name (vectype);
6193 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6194 vec_compare,
6195 reduc_inputs[0],
6196 zero_vec);
6197 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6199 /* Finally we need to extract the data value from the vector (VEC_COND)
6200 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6201 reduction, but because this doesn't exist, we can use a MAX reduction
6202 instead. The data value might be signed or a float so we need to cast
6203 it first.
6204 In the case where the loop never made any matches, the data values are
6205 all identical, and so will reduce down correctly. */
6207 /* Make the matched data values unsigned. */
6208 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6209 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6210 vec_cond);
6211 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6212 VIEW_CONVERT_EXPR,
6213 vec_cond_cast_rhs);
6214 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6216 /* Reduce down to a scalar value. */
6217 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6218 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6219 1, vec_cond_cast);
6220 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6221 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6223 /* Convert the reduced value back to the result type and set as the
6224 result. */
6225 gimple_seq stmts = NULL;
6226 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6227 data_reduc);
6228 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6229 scalar_results.safe_push (new_temp);
6231 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6232 && reduc_fn == IFN_LAST)
6234 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6235 the equivalent of:
6236 idx_val = induction_index[0];
6237 val = data_reduc[0];
6238 for (i = 1; i < nelts; ++i)
6239 if (induction_index[i] > idx_val)
6240 val = data_reduc[i], idx_val = induction_index[i];
6241 return val; */
6243 tree data_eltype = TREE_TYPE (vectype);
6244 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6245 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6246 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6247 /* Enforced by vectorizable_reduction, which ensures we have target
6248 support before allowing a conditional reduction on variable-length
6249 vectors. */
6250 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6251 tree idx_val = NULL_TREE, val = NULL_TREE;
6252 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6254 tree old_idx_val = idx_val;
6255 tree old_val = val;
6256 idx_val = make_ssa_name (idx_eltype);
6257 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6258 build3 (BIT_FIELD_REF, idx_eltype,
6259 induction_index,
6260 bitsize_int (el_size),
6261 bitsize_int (off)));
6262 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6263 val = make_ssa_name (data_eltype);
6264 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6265 build3 (BIT_FIELD_REF,
6266 data_eltype,
6267 reduc_inputs[0],
6268 bitsize_int (el_size),
6269 bitsize_int (off)));
6270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6271 if (off != 0)
6273 tree new_idx_val = idx_val;
6274 if (off != v_size - el_size)
6276 new_idx_val = make_ssa_name (idx_eltype);
6277 epilog_stmt = gimple_build_assign (new_idx_val,
6278 MAX_EXPR, idx_val,
6279 old_idx_val);
6280 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6282 tree cond = make_ssa_name (boolean_type_node);
6283 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6284 idx_val, old_idx_val);
6285 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6286 tree new_val = make_ssa_name (data_eltype);
6287 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6288 cond, val, old_val);
6289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6290 idx_val = new_idx_val;
6291 val = new_val;
6294 /* Convert the reduced value back to the result type and set as the
6295 result. */
6296 gimple_seq stmts = NULL;
6297 val = gimple_convert (&stmts, scalar_type, val);
6298 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6299 scalar_results.safe_push (val);
6302 /* 2.3 Create the reduction code, using one of the three schemes described
6303 above. In SLP we simply need to extract all the elements from the
6304 vector (without reducing them), so we use scalar extracts (case 3 below). */
6305 else if (reduc_fn != IFN_LAST && !slp_reduc)
6307 tree tmp;
6308 tree vec_elem_type;
6310 /* Case 1: Create:
6311 v_out2 = reduc_expr <v_out1> */
6313 if (dump_enabled_p ())
6314 dump_printf_loc (MSG_NOTE, vect_location,
6315 "Reduce using direct vector reduction.\n");
6317 gimple_seq stmts = NULL;
6318 vec_elem_type = TREE_TYPE (vectype);
6319 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6320 vec_elem_type, reduc_inputs[0]);
6321 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6322 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6324 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6325 && induc_val)
6327 /* Earlier we set the initial value to be a vector of induc_val
6328 values. Check the result and if it is induc_val then replace it
6329 with the original initial value, unless induc_val is
6330 the same as initial_def already. */
6331 tree zcompare = make_ssa_name (boolean_type_node);
6332 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6333 new_temp, induc_val);
6334 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6335 tree initial_def = reduc_info->reduc_initial_values[0];
6336 tmp = make_ssa_name (new_scalar_dest);
6337 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6338 initial_def, new_temp);
6339 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6340 new_temp = tmp;
6343 scalar_results.safe_push (new_temp);
6345 else if (direct_slp_reduc)
6347 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6348 with the elements for other SLP statements replaced with the
6349 neutral value. We can then do a normal reduction on each vector. */
6351 /* Enforced by vectorizable_reduction. */
6352 gcc_assert (reduc_inputs.length () == 1);
6353 gcc_assert (pow2p_hwi (group_size));
6355 gimple_seq seq = NULL;
6357 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6358 and the same element size as VECTYPE. */
6359 tree index = build_index_vector (vectype, 0, 1);
6360 tree index_type = TREE_TYPE (index);
6361 tree index_elt_type = TREE_TYPE (index_type);
6362 tree mask_type = truth_type_for (index_type);
6364 /* Create a vector that, for each element, identifies which of
6365 the REDUC_GROUP_SIZE results should use it. */
6366 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6367 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6368 build_vector_from_val (index_type, index_mask));
6370 /* Get a neutral vector value. This is simply a splat of the neutral
6371 scalar value if we have one, otherwise the initial scalar value
6372 is itself a neutral value. */
6373 tree vector_identity = NULL_TREE;
6374 tree neutral_op = NULL_TREE;
6375 if (slp_node)
6377 tree initial_value = NULL_TREE;
6378 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6379 initial_value = reduc_info->reduc_initial_values[0];
6380 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6381 initial_value);
6383 if (neutral_op)
6384 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6385 neutral_op);
6386 for (unsigned int i = 0; i < group_size; ++i)
6388 /* If there's no universal neutral value, we can use the
6389 initial scalar value from the original PHI. This is used
6390 for MIN and MAX reduction, for example. */
6391 if (!neutral_op)
6393 tree scalar_value = reduc_info->reduc_initial_values[i];
6394 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6395 scalar_value);
6396 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6397 scalar_value);
6400 /* Calculate the equivalent of:
6402 sel[j] = (index[j] == i);
6404 which selects the elements of REDUC_INPUTS[0] that should
6405 be included in the result. */
6406 tree compare_val = build_int_cst (index_elt_type, i);
6407 compare_val = build_vector_from_val (index_type, compare_val);
6408 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6409 index, compare_val);
6411 /* Calculate the equivalent of:
6413 vec = sel ? reduc_inputs[0] : vector_identity;
6415 VEC is now suitable for a full vector reduction. */
6416 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6417 sel, reduc_inputs[0], vector_identity);
6419 /* Do the reduction and convert it to the appropriate type. */
6420 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6421 TREE_TYPE (vectype), vec);
6422 scalar = gimple_convert (&seq, scalar_type, scalar);
6423 scalar_results.safe_push (scalar);
6425 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6427 else
6429 bool reduce_with_shift;
6430 tree vec_temp;
6432 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6434 /* See if the target wants to do the final (shift) reduction
6435 in a vector mode of smaller size and first reduce upper/lower
6436 halves against each other. */
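      /* For instance, a target could ask to split a V8SI accumulation into
	 two V4SI halves here, add the halves, and continue the shift-based
	 reduction in the narrower mode; whether that happens is entirely up
	 to targetm.vectorize.split_reduction.  */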
6437 enum machine_mode mode1 = mode;
6438 tree stype = TREE_TYPE (vectype);
6439 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6440 unsigned nunits1 = nunits;
6441 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6442 && reduc_inputs.length () == 1)
6444 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6445 /* For SLP reductions we have to make sure lanes match up, but
6446 since we're doing an individual-element final reduction, reducing
6447 the vector width here is even more important.
6448 ??? We could also separate lanes with permutes; for the common
6449 case of a power-of-two group size, odd/even extracts would work. */
6450 if (slp_reduc && nunits != nunits1)
6452 nunits1 = least_common_multiple (nunits1, group_size);
6453 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6456 if (!slp_reduc
6457 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6458 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6460 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6461 stype, nunits1);
6462 reduce_with_shift = have_whole_vector_shift (mode1);
6463 if (!VECTOR_MODE_P (mode1)
6464 || !directly_supported_p (code, vectype1))
6465 reduce_with_shift = false;
6467 /* First reduce the vector to the desired vector size we should
6468 do shift reduction on by combining upper and lower halves. */
6469 gimple_seq stmts = NULL;
6470 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6471 code, &stmts);
6472 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6473 reduc_inputs[0] = new_temp;
6475 if (reduce_with_shift && !slp_reduc)
6477 int element_bitsize = tree_to_uhwi (bitsize);
6478 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6479 for variable-length vectors and also requires direct target support
6480 for loop reductions. */
6481 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6482 int nelements = vec_size_in_bits / element_bitsize;
6483 vec_perm_builder sel;
6484 vec_perm_indices indices;
6486 int elt_offset;
6488 tree zero_vec = build_zero_cst (vectype1);
6489 /* Case 2: Create:
6490 for (offset = nelements/2; offset >= 1; offset/=2)
6492 Create: va' = vec_shift <va, offset>
6493 Create: va = vop <va, va'>
6494 } */
6496 tree rhs;
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_NOTE, vect_location,
6500 "Reduce using vector shifts\n");
6502 gimple_seq stmts = NULL;
6503 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6504 for (elt_offset = nelements / 2;
6505 elt_offset >= 1;
6506 elt_offset /= 2)
6508 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6509 indices.new_vector (sel, 2, nelements);
6510 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6511 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6512 new_temp, zero_vec, mask);
6513 new_temp = gimple_build (&stmts, code,
6514 vectype1, new_name, new_temp);
6516 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6518 /* 2.4 Extract the final scalar result. Create:
6519 s_out3 = extract_field <v_out2, bitpos> */
6521 if (dump_enabled_p ())
6522 dump_printf_loc (MSG_NOTE, vect_location,
6523 "extract scalar result\n");
6525 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6526 bitsize, bitsize_zero_node);
6527 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6528 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6529 gimple_assign_set_lhs (epilog_stmt, new_temp);
6530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6531 scalar_results.safe_push (new_temp);
6533 else
6535 /* Case 3: Create:
6536 s = extract_field <v_out2, 0>
6537 for (offset = element_size;
6538 offset < vector_size;
6539 offset += element_size;)
6541 Create: s' = extract_field <v_out2, offset>
6542 Create: s = op <s, s'> // For non SLP cases
6543 } */
6545 if (dump_enabled_p ())
6546 dump_printf_loc (MSG_NOTE, vect_location,
6547 "Reduce using scalar code.\n");
6549 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6550 int element_bitsize = tree_to_uhwi (bitsize);
6551 tree compute_type = TREE_TYPE (vectype);
6552 gimple_seq stmts = NULL;
6553 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6555 int bit_offset;
6556 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6557 vec_temp, bitsize, bitsize_zero_node);
6559 /* In SLP we don't need to apply reduction operation, so we just
6560 collect s' values in SCALAR_RESULTS. */
6561 if (slp_reduc)
6562 scalar_results.safe_push (new_temp);
6564 for (bit_offset = element_bitsize;
6565 bit_offset < vec_size_in_bits;
6566 bit_offset += element_bitsize)
6568 tree bitpos = bitsize_int (bit_offset);
6569 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6570 compute_type, vec_temp,
6571 bitsize, bitpos);
6572 if (slp_reduc)
6574 /* In SLP we don't need to apply reduction operation, so
6575 we just collect s' values in SCALAR_RESULTS. */
6576 new_temp = new_name;
6577 scalar_results.safe_push (new_name);
6579 else
6580 new_temp = gimple_build (&stmts, code, compute_type,
6581 new_name, new_temp);
6585 /* The only case where we need to reduce scalar results in SLP is
6586 unrolling. If the size of SCALAR_RESULTS is greater than
6587 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6588 REDUC_GROUP_SIZE. */
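	  /* For example, with REDUC_GROUP_SIZE 2 and an unrolled SLP
	     reduction, SCALAR_RESULTS may hold {a0, b0, a1, b1}; the loop
	     below folds this to {a0 op a1, b0 op b1} before converting the
	     results to the scalar type.  */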
6589 if (slp_reduc)
6591 tree res, first_res, new_res;
6593 /* Reduce multiple scalar results in case of SLP unrolling. */
6594 for (j = group_size; scalar_results.iterate (j, &res);
6595 j++)
6597 first_res = scalar_results[j % group_size];
6598 new_res = gimple_build (&stmts, code, compute_type,
6599 first_res, res);
6600 scalar_results[j % group_size] = new_res;
6602 scalar_results.truncate (group_size);
6603 for (k = 0; k < group_size; k++)
6604 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6605 scalar_results[k]);
6607 else
6609 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6610 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6611 scalar_results.safe_push (new_temp);
6614 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6617 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6618 && induc_val)
6620 /* Earlier we set the initial value to be a vector of induc_val
6621 values. Check the result and if it is induc_val then replace it
6622 with the original initial value, unless induc_val is
6623 the same as initial_def already. */
6624 tree zcompare = make_ssa_name (boolean_type_node);
6625 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6626 induc_val);
6627 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6628 tree initial_def = reduc_info->reduc_initial_values[0];
6629 tree tmp = make_ssa_name (new_scalar_dest);
6630 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6631 initial_def, new_temp);
6632 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6633 scalar_results[0] = tmp;
6637 /* 2.5 Adjust the final result by the initial value of the reduction
6638 variable. (When such adjustment is not needed, then
6639 'adjustment_def' is zero). For example, if code is PLUS we create:
6640 new_temp = loop_exit_def + adjustment_def */
6642 if (adjustment_def)
6644 gcc_assert (!slp_reduc);
6645 gimple_seq stmts = NULL;
6646 if (double_reduc)
6648 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6649 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6650 new_temp = gimple_build (&stmts, code, vectype,
6651 reduc_inputs[0], adjustment_def);
6653 else
6655 new_temp = scalar_results[0];
6656 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6657 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6658 adjustment_def);
6659 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6660 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6661 new_temp, adjustment_def);
6662 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6665 epilog_stmt = gimple_seq_last_stmt (stmts);
6666 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6667 scalar_results[0] = new_temp;
6670 /* Record this operation if it could be reused by the epilogue loop. */
6671 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6672 && reduc_inputs.length () == 1)
6673 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6674 { orig_reduc_input, reduc_info });
6676 if (double_reduc)
6677 loop = outer_loop;
6679 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6680 phis with new adjusted scalar results, i.e., replace use <s_out0>
6681 with use <s_out4>.
6683 Transform:
6684 loop_exit:
6685 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6686 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6687 v_out2 = reduce <v_out1>
6688 s_out3 = extract_field <v_out2, 0>
6689 s_out4 = adjust_result <s_out3>
6690 use <s_out0>
6691 use <s_out0>
6693 into:
6695 loop_exit:
6696 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6697 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6698 v_out2 = reduce <v_out1>
6699 s_out3 = extract_field <v_out2, 0>
6700 s_out4 = adjust_result <s_out3>
6701 use <s_out4>
6702 use <s_out4> */
6704 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6705 for (k = 0; k < live_out_stmts.size (); k++)
6707 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6708 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6710 phis.create (3);
6711 /* Find the loop-closed-use at the loop exit of the original scalar
6712 result. (The reduction result is expected to have two immediate uses,
6713 one at the latch block, and one at the loop exit). For double
6714 reductions we are looking for exit phis of the outer loop. */
6715 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6717 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6719 if (!is_gimple_debug (USE_STMT (use_p)))
6720 phis.safe_push (USE_STMT (use_p));
6722 else
6724 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6726 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6728 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6730 if (!flow_bb_inside_loop_p (loop,
6731 gimple_bb (USE_STMT (phi_use_p)))
6732 && !is_gimple_debug (USE_STMT (phi_use_p)))
6733 phis.safe_push (USE_STMT (phi_use_p));
6739 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6741 /* Replace the uses: */
6742 orig_name = PHI_RESULT (exit_phi);
6744 /* Look for a single use at the target of the skip edge. */
6745 if (unify_with_main_loop_p)
6747 use_operand_p use_p;
6748 gimple *user;
6749 if (!single_imm_use (orig_name, &use_p, &user))
6750 gcc_unreachable ();
6751 orig_name = gimple_get_lhs (user);
6754 scalar_result = scalar_results[k];
6755 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6757 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6758 SET_USE (use_p, scalar_result);
6759 update_stmt (use_stmt);
6763 phis.release ();
6767 /* Return a vector of type VECTYPE that is equal to the vector select
6768 operation "MASK ? VEC : IDENTITY". Insert the select statements
6769 before GSI. */
6771 static tree
6772 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6773 tree vec, tree identity)
6775 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6776 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6777 mask, vec, identity);
6778 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6779 return cond;
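/* For example (SSA names invented), with a 4-lane float vector and a zero
   identity this emits a single statement of the form

     cond_7 = VEC_COND_EXPR <mask_5, vec_3, { 0.0, 0.0, 0.0, 0.0 }>;

   so that masked-off lanes feed the identity value into the in-order
   reduction that follows.  */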
6782 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6783 order, starting with LHS. Insert the extraction statements before GSI and
6784 associate the new scalar SSA names with variable SCALAR_DEST.
6785 Return the SSA name for the result. */
6787 static tree
6788 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6789 tree_code code, tree lhs, tree vector_rhs)
6791 tree vectype = TREE_TYPE (vector_rhs);
6792 tree scalar_type = TREE_TYPE (vectype);
6793 tree bitsize = TYPE_SIZE (scalar_type);
6794 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6795 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6797 for (unsigned HOST_WIDE_INT bit_offset = 0;
6798 bit_offset < vec_size_in_bits;
6799 bit_offset += element_bitsize)
6801 tree bitpos = bitsize_int (bit_offset);
6802 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6803 bitsize, bitpos);
6805 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6806 rhs = make_ssa_name (scalar_dest, stmt);
6807 gimple_assign_set_lhs (stmt, rhs);
6808 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6810 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6811 tree new_name = make_ssa_name (scalar_dest, stmt);
6812 gimple_assign_set_lhs (stmt, new_name);
6813 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6814 lhs = new_name;
6816 return lhs;
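/* Illustrative expansion (SSA names invented) for a 4 x 32-bit vector and
   CODE == PLUS_EXPR:

     s_0 = BIT_FIELD_REF <vector_rhs, 32, 0>;
     lhs_1 = lhs + s_0;
     s_1 = BIT_FIELD_REF <vector_rhs, 32, 32>;
     lhs_2 = lhs_1 + s_1;
     s_2 = BIT_FIELD_REF <vector_rhs, 32, 64>;
     lhs_3 = lhs_2 + s_2;
     s_3 = BIT_FIELD_REF <vector_rhs, 32, 96>;
     lhs_4 = lhs_3 + s_3;

   and lhs_4 is returned, preserving the left-to-right association that an
   in-order reduction requires.  */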
6819 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6820 type of the vector input. */
6822 static internal_fn
6823 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6825 internal_fn mask_reduc_fn;
6826 internal_fn mask_len_reduc_fn;
6828 switch (reduc_fn)
6830 case IFN_FOLD_LEFT_PLUS:
6831 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6832 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6833 break;
6835 default:
6836 return IFN_LAST;
6839 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6840 OPTIMIZE_FOR_SPEED))
6841 return mask_reduc_fn;
6842 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6843 OPTIMIZE_FOR_SPEED))
6844 return mask_len_reduc_fn;
6845 return IFN_LAST;
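/* For example, for IFN_FOLD_LEFT_PLUS on a target that only provides the
   length-and-mask variant, this returns IFN_MASK_LEN_FOLD_LEFT_PLUS; on a
   target with neither masked variant it returns IFN_LAST and the caller
   has to fall back to merging with an identity vector or open-coding the
   reduction.  */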
6848 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6849 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6850 statement. CODE is the operation performed by STMT_INFO and OPS are
6851 its scalar operands. REDUC_INDEX is the index of the operand in
6852 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6853 implements in-order reduction, or IFN_LAST if we should open-code it.
6854 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6855 that should be used to control the operation in a fully-masked loop. */
6857 static bool
6858 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6859 stmt_vec_info stmt_info,
6860 gimple_stmt_iterator *gsi,
6861 gimple **vec_stmt, slp_tree slp_node,
6862 gimple *reduc_def_stmt,
6863 tree_code code, internal_fn reduc_fn,
6864 tree ops[3], tree vectype_in,
6865 int reduc_index, vec_loop_masks *masks,
6866 vec_loop_lens *lens)
6868 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6869 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6870 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6872 int ncopies;
6873 if (slp_node)
6874 ncopies = 1;
6875 else
6876 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6878 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6879 gcc_assert (ncopies == 1);
6880 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6882 if (slp_node)
6883 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6884 TYPE_VECTOR_SUBPARTS (vectype_in)));
6886 tree op0 = ops[1 - reduc_index];
6888 int group_size = 1;
6889 stmt_vec_info scalar_dest_def_info;
6890 auto_vec<tree> vec_oprnds0;
6891 if (slp_node)
6893 auto_vec<vec<tree> > vec_defs (2);
6894 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6895 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6896 vec_defs[0].release ();
6897 vec_defs[1].release ();
6898 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6899 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6901 else
6903 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6904 op0, &vec_oprnds0);
6905 scalar_dest_def_info = stmt_info;
6908 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6909 tree scalar_type = TREE_TYPE (scalar_dest);
6910 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6912 int vec_num = vec_oprnds0.length ();
6913 gcc_assert (vec_num == 1 || slp_node);
6914 tree vec_elem_type = TREE_TYPE (vectype_out);
6915 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6917 tree vector_identity = NULL_TREE;
6918 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6920 vector_identity = build_zero_cst (vectype_out);
6921 if (!HONOR_SIGNED_ZEROS (vectype_out))
6923 else
6925 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6926 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6927 vector_identity);
6931 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6932 int i;
6933 tree def0;
6934 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6936 gimple *new_stmt;
6937 tree mask = NULL_TREE;
6938 tree len = NULL_TREE;
6939 tree bias = NULL_TREE;
6940 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6941 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6942 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6944 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6945 i, 1);
6946 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6947 bias = build_int_cst (intQI_type_node, biasval);
6948 mask = build_minus_one_cst (truth_type_for (vectype_in));
6951 /* Handle MINUS by adding the negative. */
6952 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6954 tree negated = make_ssa_name (vectype_out);
6955 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6956 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6957 def0 = negated;
6960 if (mask && mask_reduc_fn == IFN_LAST)
6961 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6962 vector_identity);
6964 /* On the first iteration the input is simply the scalar phi
6965 result, and for subsequent iterations it is the output of
6966 the preceding operation. */
6967 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6969 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6970 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6971 def0, mask, len, bias);
6972 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6973 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6974 def0, mask);
6975 else
6976 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6977 def0);
6978 /* For chained SLP reductions the output of the previous reduction
6979 operation serves as the input of the next. For the final statement
6980 the output cannot be a temporary - we reuse the original
6981 scalar destination of the last statement. */
6982 if (i != vec_num - 1)
6984 gimple_set_lhs (new_stmt, scalar_dest_var);
6985 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6986 gimple_set_lhs (new_stmt, reduc_var);
6989 else
6991 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6992 reduc_var, def0);
6993 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6994 /* Remove the statement, so that we can use the same code paths
6995 as for statements that we've just created. */
6996 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6997 gsi_remove (&tmp_gsi, true);
7000 if (i == vec_num - 1)
7002 gimple_set_lhs (new_stmt, scalar_dest);
7003 vect_finish_replace_stmt (loop_vinfo,
7004 scalar_dest_def_info,
7005 new_stmt);
7007 else
7008 vect_finish_stmt_generation (loop_vinfo,
7009 scalar_dest_def_info,
7010 new_stmt, gsi);
7012 if (slp_node)
7013 slp_node->push_vec_def (new_stmt);
7014 else
7016 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7017 *vec_stmt = new_stmt;
7021 return true;
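/* Illustrative output (SSA names invented): in a fully-masked loop where
   IFN_MASK_FOLD_LEFT_PLUS is available, each iteration of the loop above
   ends up emitting a call of the form

     reduc_4 = .MASK_FOLD_LEFT_PLUS (reduc_3, vect_def_2, loop_mask_1);

   i.e. a strictly in-order accumulation of the active lanes into the
   scalar accumulator carried by the reduction PHI.  */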
7024 /* Function is_nonwrapping_integer_induction.
7026 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7027 does not cause overflow. */
7029 static bool
7030 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7032 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7033 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7034 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7035 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7036 widest_int ni, max_loop_value, lhs_max;
7037 wi::overflow_type overflow = wi::OVF_NONE;
7039 /* Make sure the loop is integer based. */
7040 if (TREE_CODE (base) != INTEGER_CST
7041 || TREE_CODE (step) != INTEGER_CST)
7042 return false;
7044 /* Check that the max size of the loop will not wrap. */
7046 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7047 return true;
7049 if (! max_stmt_executions (loop, &ni))
7050 return false;
7052 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7053 &overflow);
7054 if (overflow)
7055 return false;
7057 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7058 TYPE_SIGN (lhs_type), &overflow);
7059 if (overflow)
7060 return false;
7062 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7063 <= TYPE_PRECISION (lhs_type));
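/* Worked example (numbers invented): with BASE 0, STEP 4 and at most 1000
   statement executions, the largest value the induction reaches is
   0 + 4 * 1000 = 4000, which needs only 13 bits of signed precision and so
   fits a 32-bit LHS type; the function returns true.  (If the LHS type has
   undefined overflow, the check is skipped and true is returned directly.)  */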
7066 /* Check if masking can be supported by inserting a conditional expression.
7067 CODE is the code for the operation. COND_FN is the conditional internal
7068 function, if it exists. VECTYPE_IN is the type of the vector input. */
7069 static bool
7070 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7071 tree vectype_in)
7073 if (cond_fn != IFN_LAST
7074 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7075 OPTIMIZE_FOR_SPEED))
7076 return false;
7078 if (code.is_tree_code ())
7079 switch (tree_code (code))
7081 case DOT_PROD_EXPR:
7082 case SAD_EXPR:
7083 return true;
7085 default:
7086 break;
7088 return false;
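/* In other words, the VEC_COND_EXPR fallback below is only used for
   DOT_PROD_EXPR and SAD_EXPR, and only when the target does not already
   provide a conditional internal function for the operation.  */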
7091 /* Insert a conditional expression to enable masked vectorization. CODE is the
7092 code for the operation. VOP is the array of operands. MASK is the loop
7093 mask. GSI is a statement iterator used to place the new conditional
7094 expression. */
7095 static void
7096 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7097 gimple_stmt_iterator *gsi)
7099 switch (tree_code (code))
7101 case DOT_PROD_EXPR:
7103 tree vectype = TREE_TYPE (vop[1]);
7104 tree zero = build_zero_cst (vectype);
7105 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7106 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7107 mask, vop[1], zero);
7108 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7109 vop[1] = masked_op1;
7110 break;
7113 case SAD_EXPR:
7115 tree vectype = TREE_TYPE (vop[1]);
7116 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7117 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7118 mask, vop[1], vop[0]);
7119 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7120 vop[1] = masked_op1;
7121 break;
7124 default:
7125 gcc_unreachable ();
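/* E.g. for a masked DOT_PROD_EXPR the code above rewrites operand 1 as
   (SSA names invented)

     masked_op1_6 = VEC_COND_EXPR <loop_mask_3, vop1_4, { 0, ... }>;

   so inactive lanes multiply by zero and leave the accumulator unchanged;
   for SAD_EXPR the inactive lanes of operand 1 are replaced by operand 0,
   making the absolute difference zero for those lanes.  */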
7129 /* Function vectorizable_reduction.
7131 Check if STMT_INFO performs a reduction operation that can be vectorized.
7132 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7133 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7134 Return true if STMT_INFO is vectorizable in this way.
7136 This function also handles reduction idioms (patterns) that have been
7137 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7138 may be of this form:
7139 X = pattern_expr (arg0, arg1, ..., X)
7140 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7141 sequence that had been detected and replaced by the pattern-stmt
7142 (STMT_INFO).
7144 This function also handles reduction of condition expressions, for example:
7145 for (int i = 0; i < N; i++)
7146 if (a[i] < value)
7147 last = a[i];
7148 This is handled by vectorising the loop and creating an additional vector
7149 containing the loop indexes for which "a[i] < value" was true. In the
7150 function epilogue this is reduced to a single max value and then used to
7151 index into the vector of results.
7153 In some cases of reduction patterns, the type of the reduction variable X is
7154 different than the type of the other arguments of STMT_INFO.
7155 In such cases, the vectype that is used when transforming STMT_INFO into
7156 a vector stmt is different than the vectype that is used to determine the
7157 vectorization factor, because it consists of a different number of elements
7158 than the actual number of elements that are being operated upon in parallel.
7160 For example, consider an accumulation of shorts into an int accumulator.
7161 On some targets it's possible to vectorize this pattern operating on 8
7162 shorts at a time (hence, the vectype for purposes of determining the
7163 vectorization factor should be V8HI); on the other hand, the vectype that
7164 is used to create the vector form is actually V4SI (the type of the result).
7166 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7167 indicates what is the actual level of parallelism (V8HI in the example), so
7168 that the right vectorization factor would be derived. This vectype
7169 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7170 be used to create the vectorized stmt. The right vectype for the vectorized
7171 stmt is obtained from the type of the result X:
7172 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7174 This means that, contrary to "regular" reductions (or "regular" stmts in
7175 general), the following equation:
7176 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7177 does *NOT* necessarily hold for reduction patterns. */
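/* As a concrete (target-dependent, purely illustrative) instance of the
   above: for the widening accumulation

     short s; int sum;
     ...
     sum += s;        // recognized as sum = WIDEN_SUM_EXPR <s, sum>

   STMT_VINFO_VECTYPE might be V8HI, which determines the vectorization
   factor, while the vectorized statement itself is built with the V4SI
   returned by get_vectype_for_scalar_type (vinfo, TREE_TYPE (sum)).  */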
7179 bool
7180 vectorizable_reduction (loop_vec_info loop_vinfo,
7181 stmt_vec_info stmt_info, slp_tree slp_node,
7182 slp_instance slp_node_instance,
7183 stmt_vector_for_cost *cost_vec)
7185 tree vectype_in = NULL_TREE;
7186 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7187 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7188 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7189 stmt_vec_info cond_stmt_vinfo = NULL;
7190 int i;
7191 int ncopies;
7192 bool single_defuse_cycle = false;
7193 bool nested_cycle = false;
7194 bool double_reduc = false;
7195 int vec_num;
7196 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7197 tree cond_reduc_val = NULL_TREE;
7199 /* Make sure it was already recognized as a reduction computation. */
7200 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7201 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7202 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7203 return false;
7205 /* The stmt we store reduction analysis meta on. */
7206 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7207 reduc_info->is_reduc_info = true;
7209 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7211 if (is_a <gphi *> (stmt_info->stmt))
7213 if (slp_node)
7215 /* We eventually need to set a vector type on invariant
7216 arguments. */
7217 unsigned j;
7218 slp_tree child;
7219 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7220 if (!vect_maybe_update_slp_op_vectype
7221 (child, SLP_TREE_VECTYPE (slp_node)))
7223 if (dump_enabled_p ())
7224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7225 "incompatible vector types for "
7226 "invariants\n");
7227 return false;
7230 /* Analysis for double-reduction is done on the outer
7231 loop PHI, nested cycles have no further restrictions. */
7232 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7234 else
7235 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7236 return true;
7239 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7240 stmt_vec_info phi_info = stmt_info;
7241 if (!is_a <gphi *> (stmt_info->stmt))
7243 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7244 return true;
7246 if (slp_node)
7248 slp_node_instance->reduc_phis = slp_node;
7249 /* ??? We're leaving slp_node pointing to the PHIs; we only
7250 need it to get at the number of vector stmts, which wasn't
7251 yet initialized for the instance root. */
7253 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7255 use_operand_p use_p;
7256 gimple *use_stmt;
7257 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7258 &use_p, &use_stmt);
7259 gcc_assert (res);
7260 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7263 /* PHIs should not participate in patterns. */
7264 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7265 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7267 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7268 and compute the reduction chain length. Discover the real
7269 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7270 tree reduc_def
7271 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7272 loop_latch_edge
7273 (gimple_bb (reduc_def_phi)->loop_father));
7274 unsigned reduc_chain_length = 0;
7275 bool only_slp_reduc_chain = true;
7276 stmt_info = NULL;
7277 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7278 while (reduc_def != PHI_RESULT (reduc_def_phi))
7280 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7281 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7282 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7284 if (dump_enabled_p ())
7285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7286 "reduction chain broken by patterns.\n");
7287 return false;
7289 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7290 only_slp_reduc_chain = false;
7291 /* For epilogue generation live members of the chain need
7292 to point back to the PHI via their original stmt for
7293 info_for_reduction to work. For SLP we need to look at
7294 all lanes here - even though we will only vectorize from
7295 the SLP node with live lane zero, the other live lanes also
7296 need to be identified as part of a reduction to be able
7297 to skip code generation for them. */
7298 if (slp_for_stmt_info)
7300 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7301 if (STMT_VINFO_LIVE_P (s))
7302 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7304 else if (STMT_VINFO_LIVE_P (vdef))
7305 STMT_VINFO_REDUC_DEF (def) = phi_info;
7306 gimple_match_op op;
7307 if (!gimple_extract_op (vdef->stmt, &op))
7309 if (dump_enabled_p ())
7310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7311 "reduction chain includes unsupported"
7312 " statement type.\n");
7313 return false;
7315 if (CONVERT_EXPR_CODE_P (op.code))
7317 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7319 if (dump_enabled_p ())
7320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7321 "conversion in the reduction chain.\n");
7322 return false;
7325 else if (!stmt_info)
7326 /* First non-conversion stmt. */
7327 stmt_info = vdef;
7328 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7329 reduc_chain_length++;
7330 if (!stmt_info && slp_node)
7331 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7333 /* PHIs should not participate in patterns. */
7334 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7336 if (nested_in_vect_loop_p (loop, stmt_info))
7338 loop = loop->inner;
7339 nested_cycle = true;
7342 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7343 element. */
7344 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7346 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7347 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7349 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7350 gcc_assert (slp_node
7351 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7353 /* 1. Is vectorizable reduction? */
7354 /* Not supportable if the reduction variable is used in the loop, unless
7355 it's a reduction chain. */
7356 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7357 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7358 return false;
7360 /* Reductions that are not used even in an enclosing outer-loop,
7361 are expected to be "live" (used out of the loop). */
7362 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7363 && !STMT_VINFO_LIVE_P (stmt_info))
7364 return false;
7366 /* 2. Has this been recognized as a reduction pattern?
7368 Check if STMT represents a pattern that has been recognized
7369 in earlier analysis stages. For stmts that represent a pattern,
7370 the STMT_VINFO_RELATED_STMT field records the last stmt in
7371 the original sequence that constitutes the pattern. */
7373 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7374 if (orig_stmt_info)
7376 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7377 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7380 /* 3. Check the operands of the operation. The first operands are defined
7381 inside the loop body. The last operand is the reduction variable,
7382 which is defined by the loop-header-phi. */
7384 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7385 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7386 gimple_match_op op;
7387 if (!gimple_extract_op (stmt_info->stmt, &op))
7388 gcc_unreachable ();
7389 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7390 || op.code == WIDEN_SUM_EXPR
7391 || op.code == SAD_EXPR);
7393 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7394 && !SCALAR_FLOAT_TYPE_P (op.type))
7395 return false;
7397 /* Do not try to vectorize bit-precision reductions. */
7398 if (!type_has_mode_precision_p (op.type))
7399 return false;
7401 /* For lane-reducing ops we're reducing the number of reduction PHIs
7402 which means the only use of that may be in the lane-reducing operation. */
7403 if (lane_reduc_code_p
7404 && reduc_chain_length != 1
7405 && !only_slp_reduc_chain)
7407 if (dump_enabled_p ())
7408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7409 "lane-reducing reduction with extra stmts.\n");
7410 return false;
7413 /* All uses but the last are expected to be defined in the loop.
7414 The last use is the reduction variable. In case of nested cycle this
7415 assumption is not true: we use reduc_index to record the index of the
7416 reduction variable. */
7417 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7418 /* We need to skip an extra operand for COND_EXPRs with embedded
7419 comparison. */
7420 unsigned opno_adjust = 0;
7421 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7422 opno_adjust = 1;
7423 for (i = 0; i < (int) op.num_ops; i++)
7425 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7426 if (i == 0 && op.code == COND_EXPR)
7427 continue;
7429 stmt_vec_info def_stmt_info;
7430 enum vect_def_type dt;
7431 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7432 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7433 &vectype_op[i], &def_stmt_info))
7435 if (dump_enabled_p ())
7436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7437 "use not simple.\n");
7438 return false;
7440 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7441 continue;
7443 /* There should be only one cycle def in the stmt, the one
7444 leading to reduc_def. */
7445 if (VECTORIZABLE_CYCLE_DEF (dt))
7446 return false;
7448 if (!vectype_op[i])
7449 vectype_op[i]
7450 = get_vectype_for_scalar_type (loop_vinfo,
7451 TREE_TYPE (op.ops[i]), slp_op[i]);
7453 /* To properly compute ncopies we are interested in the widest
7454 non-reduction input type in case we're looking at a widening
7455 accumulation that we later handle in vect_transform_reduction. */
7456 if (lane_reduc_code_p
7457 && vectype_op[i]
7458 && (!vectype_in
7459 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7460 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7461 vectype_in = vectype_op[i];
7463 if (op.code == COND_EXPR)
7465 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7466 if (dt == vect_constant_def)
7468 cond_reduc_dt = dt;
7469 cond_reduc_val = op.ops[i];
7471 if (dt == vect_induction_def
7472 && def_stmt_info
7473 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7475 cond_reduc_dt = dt;
7476 cond_stmt_vinfo = def_stmt_info;
7480 if (!vectype_in)
7481 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7482 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7484 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7485 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7486 /* If we have a condition reduction, see if we can simplify it further. */
7487 if (v_reduc_type == COND_REDUCTION)
7489 if (slp_node)
7490 return false;
7492 /* When the reduction value is used in the condition itself, fail. */
7493 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7495 if (dump_enabled_p ())
7496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7497 "condition depends on previous iteration\n");
7498 return false;
7501 if (reduc_chain_length == 1
7502 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7503 OPTIMIZE_FOR_SPEED)
7504 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7505 vectype_in,
7506 OPTIMIZE_FOR_SPEED)))
7508 if (dump_enabled_p ())
7509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7510 "optimizing condition reduction with"
7511 " FOLD_EXTRACT_LAST.\n");
7512 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7514 else if (cond_reduc_dt == vect_induction_def)
7516 tree base
7517 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7518 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7520 gcc_assert (TREE_CODE (base) == INTEGER_CST
7521 && TREE_CODE (step) == INTEGER_CST);
7522 cond_reduc_val = NULL_TREE;
7523 enum tree_code cond_reduc_op_code = ERROR_MARK;
7524 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7525 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7527 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7528 above base; punt if base is the minimum value of the type for
7529 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7530 else if (tree_int_cst_sgn (step) == -1)
7532 cond_reduc_op_code = MIN_EXPR;
7533 if (tree_int_cst_sgn (base) == -1)
7534 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7535 else if (tree_int_cst_lt (base,
7536 TYPE_MAX_VALUE (TREE_TYPE (base))))
7537 cond_reduc_val
7538 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7540 else
7542 cond_reduc_op_code = MAX_EXPR;
7543 if (tree_int_cst_sgn (base) == 1)
7544 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7545 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7546 base))
7547 cond_reduc_val
7548 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7550 if (cond_reduc_val)
7552 if (dump_enabled_p ())
7553 dump_printf_loc (MSG_NOTE, vect_location,
7554 "condition expression based on "
7555 "integer induction.\n");
7556 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7557 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7558 = cond_reduc_val;
7559 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7562 else if (cond_reduc_dt == vect_constant_def)
7564 enum vect_def_type cond_initial_dt;
7565 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7566 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7567 if (cond_initial_dt == vect_constant_def
7568 && types_compatible_p (TREE_TYPE (cond_initial_val),
7569 TREE_TYPE (cond_reduc_val)))
7571 tree e = fold_binary (LE_EXPR, boolean_type_node,
7572 cond_initial_val, cond_reduc_val);
7573 if (e && (integer_onep (e) || integer_zerop (e)))
7575 if (dump_enabled_p ())
7576 dump_printf_loc (MSG_NOTE, vect_location,
7577 "condition expression based on "
7578 "compile time constant.\n");
7579 /* Record reduction code at analysis stage. */
7580 STMT_VINFO_REDUC_CODE (reduc_info)
7581 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7582 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7588 if (STMT_VINFO_LIVE_P (phi_info))
7589 return false;
7591 if (slp_node)
7592 ncopies = 1;
7593 else
7594 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7596 gcc_assert (ncopies >= 1);
7598 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7600 if (nested_cycle)
7602 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7603 == vect_double_reduction_def);
7604 double_reduc = true;
7607 /* 4.2. Check support for the epilog operation.
7609 If STMT represents a reduction pattern, then the type of the
7610 reduction variable may be different than the type of the rest
7611 of the arguments. For example, consider the case of accumulation
7612 of shorts into an int accumulator. The original code:
7613 S1: int_a = (int) short_a;
7614 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7616 was replaced with:
7617 STMT: int_acc = widen_sum <short_a, int_acc>
7619 This means that:
7620 1. The tree-code that is used to create the vector operation in the
7621 epilog code (that reduces the partial results) is not the
7622 tree-code of STMT, but is rather the tree-code of the original
7623 stmt from the pattern that STMT is replacing. I.e, in the example
7624 above we want to use 'widen_sum' in the loop, but 'plus' in the
7625 epilog.
7626 2. The type (mode) we use to check available target support
7627 for the vector operation to be created in the *epilog*, is
7628 determined by the type of the reduction variable (in the example
7629 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7630 However the type (mode) we use to check available target support
7631 for the vector operation to be created *inside the loop*, is
7632 determined by the type of the other arguments to STMT (in the
7633 example we'd check this: optab_handler (widen_sum_optab,
7634 vect_short_mode)).
7636 This is contrary to "regular" reductions, in which the types of all
7637 the arguments are the same as the type of the reduction variable.
7638 For "regular" reductions we can therefore use the same vector type
7639 (and also the same tree-code) when generating the epilog code and
7640 when generating the code inside the loop. */
7642 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7643 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7645 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7646 if (reduction_type == TREE_CODE_REDUCTION)
7648 /* Check whether it's ok to change the order of the computation.
7649 Generally, when vectorizing a reduction we change the order of the
7650 computation. This may change the behavior of the program in some
7651 cases, so we need to check that this is ok. One exception is when
7652 vectorizing an outer-loop: the inner-loop is executed sequentially,
7653 and therefore vectorizing reductions in the inner-loop during
7654 outer-loop vectorization is safe. Likewise when we are vectorizing
7655 a series of reductions using SLP and the VF is one the reductions
7656 are performed in scalar order. */
7657 if (slp_node
7658 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7659 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7661 else if (needs_fold_left_reduction_p (op.type, orig_code))
7663 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7664 is not directly used in stmt. */
7665 if (!only_slp_reduc_chain
7666 && reduc_chain_length != 1)
7668 if (dump_enabled_p ())
7669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7670 "in-order reduction chain without SLP.\n");
7671 return false;
7673 STMT_VINFO_REDUC_TYPE (reduc_info)
7674 = reduction_type = FOLD_LEFT_REDUCTION;
7676 else if (!commutative_binary_op_p (orig_code, op.type)
7677 || !associative_binary_op_p (orig_code, op.type))
7679 if (dump_enabled_p ())
7680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7681 "reduction: not commutative/associative");
7682 return false;
7686 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7687 && ncopies > 1)
7689 if (dump_enabled_p ())
7690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7691 "multiple types in double reduction or condition "
7692 "reduction or fold-left reduction.\n");
7693 return false;
7696 internal_fn reduc_fn = IFN_LAST;
7697 if (reduction_type == TREE_CODE_REDUCTION
7698 || reduction_type == FOLD_LEFT_REDUCTION
7699 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7700 || reduction_type == CONST_COND_REDUCTION)
7702 if (reduction_type == FOLD_LEFT_REDUCTION
7703 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7704 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7706 if (reduc_fn != IFN_LAST
7707 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7708 OPTIMIZE_FOR_SPEED))
7710 if (dump_enabled_p ())
7711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7712 "reduc op not supported by target.\n");
7714 reduc_fn = IFN_LAST;
7717 else
7719 if (!nested_cycle || double_reduc)
7721 if (dump_enabled_p ())
7722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7723 "no reduc code for scalar code.\n");
7725 return false;
7729 else if (reduction_type == COND_REDUCTION)
7731 int scalar_precision
7732 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7733 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7734 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7735 vectype_out);
7737 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7738 OPTIMIZE_FOR_SPEED))
7739 reduc_fn = IFN_REDUC_MAX;
7741 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7743 if (reduction_type != EXTRACT_LAST_REDUCTION
7744 && (!nested_cycle || double_reduc)
7745 && reduc_fn == IFN_LAST
7746 && !nunits_out.is_constant ())
7748 if (dump_enabled_p ())
7749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7750 "missing target support for reduction on"
7751 " variable-length vectors.\n");
7752 return false;
7755 /* For SLP reductions, see if there is a neutral value we can use. */
7756 tree neutral_op = NULL_TREE;
7757 if (slp_node)
7759 tree initial_value = NULL_TREE;
7760 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7761 initial_value = vect_phi_initial_value (reduc_def_phi);
7762 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7763 orig_code, initial_value);
7766 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7768 /* We can't support in-order reductions of code such as this:
7770 for (int i = 0; i < n1; ++i)
7771 for (int j = 0; j < n2; ++j)
7772 l += a[j];
7774 since GCC effectively transforms the loop when vectorizing:
7776 for (int i = 0; i < n1 / VF; ++i)
7777 for (int j = 0; j < n2; ++j)
7778 for (int k = 0; k < VF; ++k)
7779 l += a[j];
7781 which is a reassociation of the original operation. */
7782 if (dump_enabled_p ())
7783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7784 "in-order double reduction not supported.\n");
7786 return false;
7789 if (reduction_type == FOLD_LEFT_REDUCTION
7790 && slp_node
7791 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7793 /* We cannot use in-order reductions in this case because there is
7794 an implicit reassociation of the operations involved. */
7795 if (dump_enabled_p ())
7796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7797 "in-order unchained SLP reductions not supported.\n");
7798 return false;
7801 /* For double reductions, and for SLP reductions with a neutral value,
7802 we construct a variable-length initial vector by loading a vector
7803 full of the neutral value and then shift-and-inserting the start
7804 values into the low-numbered elements. */
7805 if ((double_reduc || neutral_op)
7806 && !nunits_out.is_constant ()
7807 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7808 vectype_out, OPTIMIZE_FOR_SPEED))
7810 if (dump_enabled_p ())
7811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7812 "reduction on variable-length vectors requires"
7813 " target support for a vector-shift-and-insert"
7814 " operation.\n");
7815 return false;
7818 /* Check extra constraints for variable-length unchained SLP reductions. */
7819 if (slp_node
7820 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7821 && !nunits_out.is_constant ())
7823 /* We checked above that we could build the initial vector when
7824 there's a neutral element value. Check here for the case in
7825 which each SLP statement has its own initial value and in which
7826 that value needs to be repeated for every instance of the
7827 statement within the initial vector. */
7828 unsigned int group_size = SLP_TREE_LANES (slp_node);
7829 if (!neutral_op
7830 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7831 TREE_TYPE (vectype_out)))
7833 if (dump_enabled_p ())
7834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7835 "unsupported form of SLP reduction for"
7836 " variable-length vectors: cannot build"
7837 " initial vector.\n");
7838 return false;
7840 /* The epilogue code relies on the number of elements being a multiple
7841 of the group size. The duplicate-and-interleave approach to setting
7842 up the initial vector does too. */
7843 if (!multiple_p (nunits_out, group_size))
7845 if (dump_enabled_p ())
7846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7847 "unsupported form of SLP reduction for"
7848 " variable-length vectors: the vector size"
7849 " is not a multiple of the number of results.\n");
7850 return false;
7854 if (reduction_type == COND_REDUCTION)
7856 widest_int ni;
7858 if (! max_loop_iterations (loop, &ni))
7860 if (dump_enabled_p ())
7861 dump_printf_loc (MSG_NOTE, vect_location,
7862 "loop count not known, cannot create cond "
7863 "reduction.\n");
7864 return false;
7866 /* Convert backedges to iterations. */
7867 ni += 1;
7869 /* The additional index will be the same type as the condition. Check
7870 that the loop iteration count fits into this type less one (because we'll use up the
7871 zero slot for when there are no matches). */
7872 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7873 if (wi::geu_p (ni, wi::to_widest (max_index)))
7875 if (dump_enabled_p ())
7876 dump_printf_loc (MSG_NOTE, vect_location,
7877 "loop size is greater than data size.\n");
7878 return false;
7882 /* In case the vectorization factor (VF) is bigger than the number
7883 of elements that we can fit in a vectype (nunits), we have to generate
7884 more than one vector stmt - i.e. - we need to "unroll" the
7885 vector stmt by a factor VF/nunits. For more details see documentation
7886 in vectorizable_operation. */
7888 /* If the reduction is used in an outer loop we need to generate
7889 VF intermediate results, like so (e.g. for ncopies=2):
7890 r0 = phi (init, r0)
7891 r1 = phi (init, r1)
7892 r0 = x0 + r0;
7893 r1 = x1 + r1;
7894 (i.e. we generate VF results in 2 registers).
7895 In this case we have a separate def-use cycle for each copy, and therefore
7896 for each copy we get the vector def for the reduction variable from the
7897 respective phi node created for this copy.
7899 Otherwise (the reduction is unused in the loop nest), we can combine
7900 together intermediate results, like so (e.g. for ncopies=2):
7901 r = phi (init, r)
7902 r = x0 + r;
7903 r = x1 + r;
7904 (i.e. we generate VF/2 results in a single register).
7905 In this case for each copy we get the vector def for the reduction variable
7906 from the vectorized reduction operation generated in the previous iteration.
7908 This only works when we see both the reduction PHI and its only consumer
7909 in vectorizable_reduction and there are no intermediate stmts
7910 participating. When unrolling we want each unrolled iteration to have its
7911 own reduction accumulator since one of the main goals of unrolling a
7912 reduction is to reduce the aggregate loop-carried latency. */
7913 if (ncopies > 1
7914 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7915 && reduc_chain_length == 1
7916 && loop_vinfo->suggested_unroll_factor == 1)
7917 single_defuse_cycle = true;
7919 if (single_defuse_cycle || lane_reduc_code_p)
7921 gcc_assert (op.code != COND_EXPR);
7923 /* 4. Supportable by target? */
7924 bool ok = true;
7926 /* 4.1. check support for the operation in the loop
7928 This isn't necessary for the lane reduction codes, since they
7929 can only be produced by pattern matching, and it's up to the
7930 pattern matcher to test for support. The main reason for
7931 specifically skipping this step is to avoid rechecking whether
7932 mixed-sign dot-products can be implemented using signed
7933 dot-products. */
7934 machine_mode vec_mode = TYPE_MODE (vectype_in);
7935 if (!lane_reduc_code_p
7936 && !directly_supported_p (op.code, vectype_in, optab_vector))
7938 if (dump_enabled_p ())
7939 dump_printf (MSG_NOTE, "op not supported by target.\n");
7940 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7941 || !vect_can_vectorize_without_simd_p (op.code))
7942 ok = false;
7943 else
7944 if (dump_enabled_p ())
7945 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7948 if (vect_emulated_vector_p (vectype_in)
7949 && !vect_can_vectorize_without_simd_p (op.code))
7951 if (dump_enabled_p ())
7952 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7953 return false;
7956 /* Lane-reducing operations have to go through vect_transform_reduction.
7957 For the other cases, try without the single-defuse-cycle optimization. */
7958 if (!ok)
7960 if (lane_reduc_code_p)
7961 return false;
7962 else
7963 single_defuse_cycle = false;
7966 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7968 /* If the reduction stmt is one of the patterns that have lane
7969 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7970 if ((ncopies > 1 && ! single_defuse_cycle)
7971 && lane_reduc_code_p)
7973 if (dump_enabled_p ())
7974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7975 "multi def-use cycle not possible for lane-reducing "
7976 "reduction operation\n");
7977 return false;
7980 if (slp_node
7981 && !(!single_defuse_cycle
7982 && !lane_reduc_code_p
7983 && reduction_type != FOLD_LEFT_REDUCTION))
7984 for (i = 0; i < (int) op.num_ops; i++)
7985 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7987 if (dump_enabled_p ())
7988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7989 "incompatible vector types for invariants\n");
7990 return false;
7993 if (slp_node)
7994 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7995 else
7996 vec_num = 1;
7998 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7999 reduction_type, ncopies, cost_vec);
8000 /* Cost the reduction op inside the loop if transformed via
8001 vect_transform_reduction. Otherwise this is costed by the
8002 separate vectorizable_* routines. */
8003 if (single_defuse_cycle || lane_reduc_code_p)
8005 int factor = 1;
8006 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8007 /* Three dot-products and a subtraction. */
8008 factor = 4;
8009 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8010 stmt_info, 0, vect_body);
8013 if (dump_enabled_p ()
8014 && reduction_type == FOLD_LEFT_REDUCTION)
8015 dump_printf_loc (MSG_NOTE, vect_location,
8016 "using an in-order (fold-left) reduction.\n");
8017 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8018 /* All reductions except single-defuse-cycle optimized, lane-reducing and
8019 fold-left ones go through their own vectorizable_* routines. */
8020 if (!single_defuse_cycle
8021 && !lane_reduc_code_p
8022 && reduction_type != FOLD_LEFT_REDUCTION)
8024 stmt_vec_info tem
8025 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8026 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8028 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8029 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8031 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8032 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8034 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8036 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8037 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8038 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8040 if (reduction_type != FOLD_LEFT_REDUCTION
8041 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8042 && (cond_fn == IFN_LAST
8043 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8044 OPTIMIZE_FOR_SPEED)))
8046 if (dump_enabled_p ())
8047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8048 "can't operate on partial vectors because"
8049 " no conditional operation is available.\n");
8050 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8052 else if (reduction_type == FOLD_LEFT_REDUCTION
8053 && reduc_fn == IFN_LAST
8054 && !expand_vec_cond_expr_p (vectype_in,
8055 truth_type_for (vectype_in),
8056 SSA_NAME))
8058 if (dump_enabled_p ())
8059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8060 "can't operate on partial vectors because"
8061 " no conditional operation is available.\n");
8062 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8064 else if (reduction_type == FOLD_LEFT_REDUCTION
8065 && reduc_fn == IFN_LAST
8066 && FLOAT_TYPE_P (vectype_in)
8067 && HONOR_SIGNED_ZEROS (vectype_in)
8068 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8070 if (dump_enabled_p ())
8071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8072 "can't operate on partial vectors because"
8073 " signed zeros cannot be preserved.\n");
8074 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8076 else
8078 internal_fn mask_reduc_fn
8079 = get_masked_reduction_fn (reduc_fn, vectype_in);
8081 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8082 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8083 vectype_in, 1);
8084 else
8085 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8086 vectype_in, NULL);
8089 return true;
8092 /* STMT_INFO is a dot-product reduction whose multiplication operands
8093 have different signs. Emit a sequence to emulate the operation
8094 using a series of signed DOT_PROD_EXPRs and return the last
8095 statement generated. VEC_DEST is the result of the vector operation
8096 and VOP lists its inputs. */
8098 static gassign *
8099 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8100 gimple_stmt_iterator *gsi, tree vec_dest,
8101 tree vop[3])
8103 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8104 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8105 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8106 gimple *new_stmt;
8108 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8109 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8110 std::swap (vop[0], vop[1]);
8112 /* Convert all inputs to signed types. */
8113 for (int i = 0; i < 3; ++i)
8114 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8116 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8117 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8118 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8119 vop[i] = tmp;
8122 /* In the comments below we assume 8-bit inputs for simplicity,
8123 but the approach works for any full integer type. */
8125 /* Create a vector of -128. */
8126 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8127 tree min_narrow = build_vector_from_val (narrow_vectype,
8128 min_narrow_elttype);
8130 /* Create a vector of 64. */
8131 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8132 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8133 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8135 /* Emit: SUB_RES = VOP[0] - 128. */
8136 tree sub_res = make_ssa_name (narrow_vectype);
8137 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8138 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8140 /* Emit:
8142 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8143 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8144 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8146 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8147 Doing the two 64 * y steps first allows more time to compute x. */
8148 tree stage1 = make_ssa_name (wide_vectype);
8149 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8150 vop[1], half_narrow, vop[2]);
8151 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8153 tree stage2 = make_ssa_name (wide_vectype);
8154 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8155 vop[1], half_narrow, stage1);
8156 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8158 tree stage3 = make_ssa_name (wide_vectype);
8159 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8160 sub_res, vop[1], stage2);
8161 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8163 /* Convert STAGE3 to the reduction type. */
8164 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
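/* Sanity check of the identity with concrete 8-bit values (illustration
   only): for x = 200 (unsigned) and y = -3 (signed), x * y = -600, and
   (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600 as well,
   so summing the three signed dot-products above reproduces the mixed-sign
   dot-product being emulated.  */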
8167 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8168 value. */
8170 bool
8171 vect_transform_reduction (loop_vec_info loop_vinfo,
8172 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8173 gimple **vec_stmt, slp_tree slp_node)
8175 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8176 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8177 int i;
8178 int ncopies;
8179 int vec_num;
8181 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8182 gcc_assert (reduc_info->is_reduc_info);
8184 if (nested_in_vect_loop_p (loop, stmt_info))
8186 loop = loop->inner;
8187 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8190 gimple_match_op op;
8191 if (!gimple_extract_op (stmt_info->stmt, &op))
8192 gcc_unreachable ();
8194 /* All uses but the last are expected to be defined in the loop.
8195 The last use is the reduction variable. In case of nested cycle this
8196 assumption is not true: we use reduc_index to record the index of the
8197 reduction variable. */
8198 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8199 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8200 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8201 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8203 if (slp_node)
8205 ncopies = 1;
8206 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8208 else
8210 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8211 vec_num = 1;
8214 code_helper code = canonicalize_code (op.code, op.type);
8215 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8216 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8217 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8218 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8220 /* Transform. */
8221 tree new_temp = NULL_TREE;
8222 auto_vec<tree> vec_oprnds0;
8223 auto_vec<tree> vec_oprnds1;
8224 auto_vec<tree> vec_oprnds2;
8225 tree def0;
8227 if (dump_enabled_p ())
8228 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8230 /* FORNOW: Multiple types are not supported for condition. */
8231 if (code == COND_EXPR)
8232 gcc_assert (ncopies == 1);
8234 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8236 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8237 if (reduction_type == FOLD_LEFT_REDUCTION)
8239 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8240 gcc_assert (code.is_tree_code ());
8241 return vectorize_fold_left_reduction
8242 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8243 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
8244 lens);
8247 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8248 gcc_assert (single_defuse_cycle
8249 || code == DOT_PROD_EXPR
8250 || code == WIDEN_SUM_EXPR
8251 || code == SAD_EXPR);
8253 /* Create the destination vector */
8254 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8255 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8257 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8258 single_defuse_cycle && reduc_index == 0
8259 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8260 single_defuse_cycle && reduc_index == 1
8261 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8262 op.num_ops == 3
8263 && !(single_defuse_cycle && reduc_index == 2)
8264 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8265 if (single_defuse_cycle)
8267 gcc_assert (!slp_node);
8268 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8269 op.ops[reduc_index],
8270 reduc_index == 0 ? &vec_oprnds0
8271 : (reduc_index == 1 ? &vec_oprnds1
8272 : &vec_oprnds2));
8275 bool emulated_mixed_dot_prod
8276 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8277 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8279 gimple *new_stmt;
8280 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8281 if (masked_loop_p && !mask_by_cond_expr)
8283 /* No conditional ifns have been defined for dot-product yet. */
8284 gcc_assert (code != DOT_PROD_EXPR);
8286 /* Make sure that the reduction accumulator is vop[0]. */
8287 if (reduc_index == 1)
8289 gcc_assert (commutative_binary_op_p (code, op.type));
8290 std::swap (vop[0], vop[1]);
8292 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8293 vec_num * ncopies, vectype_in, i);
8294 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8295 vop[0], vop[1], vop[0]);
8296 new_temp = make_ssa_name (vec_dest, call);
8297 gimple_call_set_lhs (call, new_temp);
8298 gimple_call_set_nothrow (call, true);
8299 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8300 new_stmt = call;
8302 else
8304 if (op.num_ops == 3)
8305 vop[2] = vec_oprnds2[i];
8307 if (masked_loop_p && mask_by_cond_expr)
8309 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8310 vec_num * ncopies, vectype_in, i);
8311 build_vect_cond_expr (code, vop, mask, gsi);
8314 if (emulated_mixed_dot_prod)
8315 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8316 vec_dest, vop);
8317 else if (code.is_internal_fn ())
8318 new_stmt = gimple_build_call_internal (internal_fn (code),
8319 op.num_ops,
8320 vop[0], vop[1], vop[2]);
8321 else
8322 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8323 vop[0], vop[1], vop[2]);
8324 new_temp = make_ssa_name (vec_dest, new_stmt);
8325 gimple_set_lhs (new_stmt, new_temp);
8326 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8329 if (slp_node)
8330 slp_node->push_vec_def (new_stmt);
8331 else if (single_defuse_cycle
8332 && i < ncopies - 1)
8334 if (reduc_index == 0)
8335 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8336 else if (reduc_index == 1)
8337 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8338 else if (reduc_index == 2)
8339 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8341 else
8342 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8345 if (!slp_node)
8346 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8348 return true;
8351 /* Transform phase of a cycle PHI. */
8353 bool
8354 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8355 stmt_vec_info stmt_info, gimple **vec_stmt,
8356 slp_tree slp_node, slp_instance slp_node_instance)
8358 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8359 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8360 int i;
8361 int ncopies;
8362 int j;
8363 bool nested_cycle = false;
8364 int vec_num;
8366 if (nested_in_vect_loop_p (loop, stmt_info))
8368 loop = loop->inner;
8369 nested_cycle = true;
8372 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8373 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8374 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8375 gcc_assert (reduc_info->is_reduc_info);
8377 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8378 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8379 /* Leave the scalar phi in place. */
8380 return true;
8382 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8383 /* For a nested cycle we do not fill the above. */
8384 if (!vectype_in)
8385 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8386 gcc_assert (vectype_in);
8388 if (slp_node)
8390 /* The size vect_schedule_slp_instance computes is off for us. */
8391 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8392 * SLP_TREE_LANES (slp_node), vectype_in);
8393 ncopies = 1;
8395 else
8397 vec_num = 1;
8398 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8401 /* Check whether we should use a single PHI node and accumulate
8402 vectors to one before the backedge. */
8403 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8404 ncopies = 1;
8406 /* Create the destination vector */
8407 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8408 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8409 vectype_out);
8411 /* Get the loop-entry arguments. */
8412 tree vec_initial_def = NULL_TREE;
8413 auto_vec<tree> vec_initial_defs;
8414 if (slp_node)
8416 vec_initial_defs.reserve (vec_num);
8417 if (nested_cycle)
8419 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8420 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8421 &vec_initial_defs);
8423 else
8425 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8426 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8427 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8429 unsigned int num_phis = stmts.length ();
8430 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8431 num_phis = 1;
8432 initial_values.reserve (num_phis);
8433 for (unsigned int i = 0; i < num_phis; ++i)
8435 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8436 initial_values.quick_push (vect_phi_initial_value (this_phi));
8438 if (vec_num == 1)
8439 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8440 if (!initial_values.is_empty ())
8442 tree initial_value
8443 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8444 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8445 tree neutral_op
8446 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8447 code, initial_value);
8448 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8449 &vec_initial_defs, vec_num,
8450 stmts.length (), neutral_op);
8454 else
8456 /* Get at the scalar def before the loop, that defines the initial
8457 value of the reduction variable. */
8458 tree initial_def = vect_phi_initial_value (phi);
8459 reduc_info->reduc_initial_values.safe_push (initial_def);
8460 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8461 and we can't use zero for induc_val, use initial_def. Similarly
8462 for REDUC_MIN and initial_def larger than the base. */
8463 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8465 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8466 if (TREE_CODE (initial_def) == INTEGER_CST
8467 && !integer_zerop (induc_val)
8468 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8469 && tree_int_cst_lt (initial_def, induc_val))
8470 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8471 && tree_int_cst_lt (induc_val, initial_def))))
8473 induc_val = initial_def;
8474 /* Communicate we used the initial_def to epilogue
8475 generation. */
8476 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8478 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8480 else if (nested_cycle)
8482 /* Do not use an adjustment def as that case is not supported
8483 correctly if ncopies is not one. */
8484 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8485 ncopies, initial_def,
8486 &vec_initial_defs);
8488 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8489 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8490 /* Fill the initial vector with the initial scalar value. */
8491 vec_initial_def
8492 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8493 initial_def, initial_def);
8494 else
8496 if (ncopies == 1)
8497 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8498 if (!reduc_info->reduc_initial_values.is_empty ())
8500 initial_def = reduc_info->reduc_initial_values[0];
8501 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8502 tree neutral_op
8503 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8504 code, initial_def);
8505 gcc_assert (neutral_op);
8506 /* Try to simplify the vector initialization by applying an
8507 adjustment after the reduction has been performed. */
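/* E.g. (editorial note) a sum reduction whose scalar initial value is 10
   can run with the neutral value 0 inside the loop, with the 10 folded
   back in by the epilogue via STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT.  */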
8508 if (!reduc_info->reused_accumulator
8509 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8510 && !operand_equal_p (neutral_op, initial_def))
8512 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8513 = initial_def;
8514 initial_def = neutral_op;
8516 vec_initial_def
8517 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8518 initial_def, neutral_op);
8523 if (vec_initial_def)
8525 vec_initial_defs.create (ncopies);
8526 for (i = 0; i < ncopies; ++i)
8527 vec_initial_defs.quick_push (vec_initial_def);
8530 if (auto *accumulator = reduc_info->reused_accumulator)
8532 tree def = accumulator->reduc_input;
8533 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8535 unsigned int nreduc;
8536 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8537 (TREE_TYPE (def)),
8538 TYPE_VECTOR_SUBPARTS (vectype_out),
8539 &nreduc);
8540 gcc_assert (res);
8541 gimple_seq stmts = NULL;
8542 /* Reduce the single vector to a smaller one. */
8543 if (nreduc != 1)
8545 /* Perform the reduction in the appropriate type. */
8546 tree rvectype = vectype_out;
8547 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8548 TREE_TYPE (TREE_TYPE (def))))
8549 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8550 TYPE_VECTOR_SUBPARTS
8551 (vectype_out));
8552 def = vect_create_partial_epilog (def, rvectype,
8553 STMT_VINFO_REDUC_CODE
8554 (reduc_info),
8555 &stmts);
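/* Roughly (editorial note), a reused V8SI accumulator feeding a V4SI
   epilogue is first folded to V4SI here, e.g. by combining its two
   halves with the reduction code for a PLUS reduction.  */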
8557 /* The epilogue loop might use a different vector mode, like
8558 VNx2DI vs. V2DI. */
8559 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8561 tree reduc_type = build_vector_type_for_mode
8562 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8563 def = gimple_convert (&stmts, reduc_type, def);
8565 /* Adjust the input so we pick up the partially reduced value
8566 for the skip edge in vect_create_epilog_for_reduction. */
8567 accumulator->reduc_input = def;
8568 /* And the reduction could be carried out using a different sign. */
8569 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8570 def = gimple_convert (&stmts, vectype_out, def);
8571 if (loop_vinfo->main_loop_edge)
8573 /* While we'd like to insert on the edge, this would split
8574 blocks and disturb bookkeeping; we will also eventually
8575 need this on the skip edge. Rely on sinking to
8576 fix up the optimal placement and insert in the pred. */
8577 gimple_stmt_iterator gsi
8578 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8579 /* Insert before a cond that eventually skips the
8580 epilogue. */
8581 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8582 gsi_prev (&gsi);
8583 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8585 else
8586 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8587 stmts);
8589 if (loop_vinfo->main_loop_edge)
8590 vec_initial_defs[0]
8591 = vect_get_main_loop_result (loop_vinfo, def,
8592 vec_initial_defs[0]);
8593 else
8594 vec_initial_defs.safe_push (def);
8597 /* Generate the reduction PHIs upfront. */
8598 for (i = 0; i < vec_num; i++)
8600 tree vec_init_def = vec_initial_defs[i];
8601 for (j = 0; j < ncopies; j++)
8603 /* Create the reduction-phi that defines the reduction
8604 operand. */
8605 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8607 /* Set the loop-entry arg of the reduction-phi. */
8608 if (j != 0 && nested_cycle)
8609 vec_init_def = vec_initial_defs[j];
8610 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8611 UNKNOWN_LOCATION);
8613 /* The loop-latch arg is set in epilogue processing. */
8615 if (slp_node)
8616 slp_node->push_vec_def (new_phi);
8617 else
8619 if (j == 0)
8620 *vec_stmt = new_phi;
8621 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8626 return true;
8629 /* Vectorizes LC PHIs. */
8631 bool
8632 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8633 stmt_vec_info stmt_info, gimple **vec_stmt,
8634 slp_tree slp_node)
8636 if (!loop_vinfo
8637 || !is_a <gphi *> (stmt_info->stmt)
8638 || gimple_phi_num_args (stmt_info->stmt) != 1)
8639 return false;
8641 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8642 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8643 return false;
8645 if (!vec_stmt) /* transformation not required. */
8647 /* Deal with copies from externs or constants that are disguised as
8648 loop-closed PHI nodes (PR97886). */
8649 if (slp_node
8650 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8651 SLP_TREE_VECTYPE (slp_node)))
8653 if (dump_enabled_p ())
8654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8655 "incompatible vector types for invariants\n");
8656 return false;
8658 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8659 return true;
8662 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8663 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8664 basic_block bb = gimple_bb (stmt_info->stmt);
8665 edge e = single_pred_edge (bb);
8666 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8667 auto_vec<tree> vec_oprnds;
8668 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8669 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8670 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8671 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8673 /* Create the vectorized LC PHI node. */
8674 gphi *new_phi = create_phi_node (vec_dest, bb);
8675 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8676 if (slp_node)
8677 slp_node->push_vec_def (new_phi);
8678 else
8679 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8681 if (!slp_node)
8682 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8684 return true;
8687 /* Vectorizes PHIs. */
8689 bool
8690 vectorizable_phi (vec_info *,
8691 stmt_vec_info stmt_info, gimple **vec_stmt,
8692 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8694 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8695 return false;
8697 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8698 return false;
8700 tree vectype = SLP_TREE_VECTYPE (slp_node);
8702 if (!vec_stmt) /* transformation not required. */
8704 slp_tree child;
8705 unsigned i;
8706 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8707 if (!child)
8709 if (dump_enabled_p ())
8710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8711 "PHI node with unvectorized backedge def\n");
8712 return false;
8714 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8716 if (dump_enabled_p ())
8717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8718 "incompatible vector types for invariants\n");
8719 return false;
8721 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8722 && !useless_type_conversion_p (vectype,
8723 SLP_TREE_VECTYPE (child)))
8725 /* With bools we can have mask and non-mask precision vectors
8726 or different non-mask precisions. While pattern recognition is
8727 supposed to guarantee consistency here, bugs in it can cause
8728 mismatches (PR103489 and PR103800 for example).
8729 Deal with them here instead of ICEing later. */
8730 if (dump_enabled_p ())
8731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8732 "incompatible vector type setup from "
8733 "bool pattern detection\n");
8734 return false;
8737 /* For single-argument PHIs assume coalescing which means zero cost
8738 for the scalar and the vector PHIs. This avoids artificially
8739 favoring the vector path (but may pessimize it in some cases). */
8740 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8741 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8742 vector_stmt, stmt_info, vectype, 0, vect_body);
8743 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8744 return true;
8747 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8748 basic_block bb = gimple_bb (stmt_info->stmt);
8749 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8750 auto_vec<gphi *> new_phis;
8751 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8753 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8755 /* Skip not yet vectorized defs. */
8756 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8757 && SLP_TREE_VEC_DEFS (child).is_empty ())
8758 continue;
8760 auto_vec<tree> vec_oprnds;
8761 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8762 if (!new_phis.exists ())
8764 new_phis.create (vec_oprnds.length ());
8765 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8767 /* Create the vectorized PHI node. */
8768 new_phis.quick_push (create_phi_node (vec_dest, bb));
8769 slp_node->push_vec_def (new_phis[j]);
8772 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8773 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8774 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8776 /* We should have at least one already vectorized child. */
8777 gcc_assert (new_phis.exists ());
8779 return true;
8782 /* Vectorizes first order recurrences. An overview of the transformation
8783 is described below. Suppose we have the following loop.
8785 int t = 0;
8786 for (int i = 0; i < n; ++i)
8788 b[i] = a[i] - t;
8789 t = a[i];
8792 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8793 looks (simplified) like:
8795 scalar.preheader:
8796 init = 0;
8798 scalar.body:
8799 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8800 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8801 _1 = a[i]
8802 b[i] = _1 - _2
8803 if (i < n) goto scalar.body
8805 In this example, _2 is a recurrence because its value depends on the
8806 previous iteration. We vectorize this as (VF = 4)
8808 vector.preheader:
8809 vect_init = vect_cst(..., ..., ..., 0)
8811 vector.body
8812 i = PHI <0(vector.preheader), i+4(vector.body)>
8813 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8814 vect_2 = a[i, i+1, i+2, i+3];
8815 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8816 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8817 if (..) goto vector.body
8819 In this function, vectorizable_recurr, we code generate both the
8820 vector PHI node and the permute since those together compute the
8821 vectorized value of the scalar PHI. We do not yet have the
8822 backedge value to fill in there nor into the vec_perm. Those
8823 are filled in maybe_set_vectorized_backedge_value and
8824 vect_schedule_scc.
8826 TODO: Since the scalar loop does not have a use of the recurrence
8827 outside of the loop the natural way to implement peeling via
8828 vectorizing the live value doesn't work. For now peeling of loops
8829 with a recurrence is not implemented. For SLP the supported cases
8830 are restricted to those requiring a single vector recurrence PHI. */
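/* Editorial illustration of the permute above, not generated code: with
   VF = 4, a[] = { 10, 20, 30, 40 } and t starting at 0, the scalar loop
   stores b[] = { 10, 10, 10, 10 }.  In the vector form vect_1 ends in
   lane value 0 (only its last lane matters on the first iteration),
   vect_2 = { 10, 20, 30, 40 }, and the permute { 3, 4, 5, 6 } selects
   { 0, 10, 20, 30 }, so vect_2 - vect_3 is again { 10, 10, 10, 10 }.  */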
8832 bool
8833 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8834 gimple **vec_stmt, slp_tree slp_node,
8835 stmt_vector_for_cost *cost_vec)
8837 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8838 return false;
8840 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8842 /* So far we only support first-order recurrence auto-vectorization. */
8843 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8844 return false;
8846 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8847 unsigned ncopies;
8848 if (slp_node)
8849 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8850 else
8851 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8852 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8853 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8854 /* We need to be able to make progress with a single vector. */
8855 if (maybe_gt (dist * 2, nunits))
8857 if (dump_enabled_p ())
8858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8859 "first order recurrence exceeds half of "
8860 "a vector\n");
8861 return false;
8864 /* First-order recurrence autovectorization needs to handle permutation
8865 with indices = [nunits-1, nunits, nunits+1, ...]. */
8866 vec_perm_builder sel (nunits, 1, 3);
8867 for (int i = 0; i < 3; ++i)
8868 sel.quick_push (nunits - dist + i);
8869 vec_perm_indices indices (sel, 2, nunits);
8871 if (!vec_stmt) /* transformation not required. */
8873 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8874 indices))
8875 return false;
8877 if (slp_node)
8879 /* We eventually need to set a vector type on invariant
8880 arguments. */
8881 unsigned j;
8882 slp_tree child;
8883 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8884 if (!vect_maybe_update_slp_op_vectype
8885 (child, SLP_TREE_VECTYPE (slp_node)))
8887 if (dump_enabled_p ())
8888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8889 "incompatible vector types for "
8890 "invariants\n");
8891 return false;
8894 /* The recurrence costs the initialization vector and one permute
8895 for each copy. */
8896 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8897 stmt_info, 0, vect_prologue);
8898 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8899 stmt_info, 0, vect_body);
8900 if (dump_enabled_p ())
8901 dump_printf_loc (MSG_NOTE, vect_location,
8902 "vectorizable_recurr: inside_cost = %d, "
8903 "prologue_cost = %d .\n", inside_cost,
8904 prologue_cost);
8906 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8907 return true;
8910 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8911 basic_block bb = gimple_bb (phi);
8912 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8913 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8915 gimple_seq stmts = NULL;
8916 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8917 gsi_insert_seq_on_edge_immediate (pe, stmts);
8919 tree vec_init = build_vector_from_val (vectype, preheader);
8920 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8922 /* Create the vectorized first-order PHI node. */
8923 tree vec_dest = vect_get_new_vect_var (vectype,
8924 vect_simple_var, "vec_recur_");
8925 gphi *new_phi = create_phi_node (vec_dest, bb);
8926 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8928 /* Insert shuffles for the first-order recurrence autovectorization.
8929 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8930 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8932 /* Insert the required permute after the latch definition. The
8933 second and later operands are tentative and will be updated when we have
8934 vectorized the latch definition. */
8935 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8936 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8937 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8938 gsi_next (&gsi2);
8940 for (unsigned i = 0; i < ncopies; ++i)
8942 vec_dest = make_ssa_name (vectype);
8943 gassign *vperm
8944 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8945 i == 0 ? gimple_phi_result (new_phi) : NULL,
8946 NULL, perm);
8947 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8949 if (slp_node)
8950 slp_node->push_vec_def (vperm);
8951 else
8952 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8955 if (!slp_node)
8956 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8957 return true;
8960 /* Return true if VECTYPE represents a vector that requires lowering
8961 by the vector lowering pass. */
8963 bool
8964 vect_emulated_vector_p (tree vectype)
8966 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8967 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8968 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8971 /* Return true if we can emulate CODE on an integer mode representation
8972 of a vector. */
8974 bool
8975 vect_can_vectorize_without_simd_p (tree_code code)
8977 switch (code)
8979 case PLUS_EXPR:
8980 case MINUS_EXPR:
8981 case NEGATE_EXPR:
8982 case BIT_AND_EXPR:
8983 case BIT_IOR_EXPR:
8984 case BIT_XOR_EXPR:
8985 case BIT_NOT_EXPR:
8986 return true;
8988 default:
8989 return false;
8993 /* Likewise, but taking a code_helper. */
8995 bool
8996 vect_can_vectorize_without_simd_p (code_helper code)
8998 return (code.is_tree_code ()
8999 && vect_can_vectorize_without_simd_p (tree_code (code)));
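/* Editorial illustration of the emulation these predicates allow,
   assuming four 8-bit lanes held in a plain 32-bit integer: the bitwise
   codes need no lane handling at all, e.g. a hypothetical helper
   (assuming <stdint.h>)

     uint32_t vand (uint32_t a, uint32_t b) { return a & b; }

   performs four lane-wise ANDs at once, while PLUS/MINUS/NEGATE also
   need carries contained between lanes, which the generic vector
   lowering pass is expected to take care of.  */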
9002 /* Create vector init for vectorized iv. */
9003 static tree
9004 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9005 tree step_expr, poly_uint64 nunits,
9006 tree vectype,
9007 enum vect_induction_op_type induction_type)
9009 unsigned HOST_WIDE_INT const_nunits;
9010 tree vec_shift, vec_init, new_name;
9011 unsigned i;
9012 tree itype = TREE_TYPE (vectype);
9014 /* iv_loop is the loop to be vectorized. Create the first nunits values
9015 of the iv according to induction_type (S = step_expr, X = init_expr). */
9016 new_name = gimple_convert (stmts, itype, init_expr);
9017 switch (induction_type)
9019 case vect_step_op_shr:
9020 case vect_step_op_shl:
9021 /* Build the initial value: splat init_expr and shift lane i by i * step_expr. */
9022 vec_init = gimple_build_vector_from_val (stmts,
9023 vectype,
9024 new_name);
9025 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9026 build_zero_cst (itype), step_expr);
9027 vec_init = gimple_build (stmts,
9028 (induction_type == vect_step_op_shr
9029 ? RSHIFT_EXPR : LSHIFT_EXPR),
9030 vectype, vec_init, vec_shift);
9031 break;
9033 case vect_step_op_neg:
9035 vec_init = gimple_build_vector_from_val (stmts,
9036 vectype,
9037 new_name);
9038 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9039 vectype, vec_init);
9040 /* The encoding has 2 interleaved stepped patterns. */
9041 vec_perm_builder sel (nunits, 2, 3);
9042 sel.quick_grow (6);
9043 for (i = 0; i < 3; i++)
9045 sel[2 * i] = i;
9046 sel[2 * i + 1] = i + nunits;
9048 vec_perm_indices indices (sel, 2, nunits);
9049 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9050 fail when vec_init is a const vector. In that situation the vec_perm is not
9051 really needed. */
9052 tree perm_mask_even
9053 = vect_gen_perm_mask_any (vectype, indices);
9054 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9055 vectype,
9056 vec_init, vec_neg,
9057 perm_mask_even);
9059 break;
9061 case vect_step_op_mul:
9063 /* Use an unsigned mult to avoid undefined behavior on signed integer overflow. */
9064 gcc_assert (nunits.is_constant (&const_nunits));
9065 tree utype = unsigned_type_for (itype);
9066 tree uvectype = build_vector_type (utype,
9067 TYPE_VECTOR_SUBPARTS (vectype));
9068 new_name = gimple_convert (stmts, utype, new_name);
9069 vec_init = gimple_build_vector_from_val (stmts,
9070 uvectype,
9071 new_name);
9072 tree_vector_builder elts (uvectype, const_nunits, 1);
9073 tree elt_step = build_one_cst (utype);
9075 elts.quick_push (elt_step);
9076 for (i = 1; i < const_nunits; i++)
9078 /* Create: elt_step_i = elt_step_(i-1) * step_expr. */
9079 elt_step = gimple_build (stmts, MULT_EXPR,
9080 utype, elt_step, step_expr);
9081 elts.quick_push (elt_step);
9083 /* Create a vector from [1, step_expr, pow (step_expr, 2), ...,
9084 pow (step_expr, nunits-1)]. */
9085 tree vec_mul = gimple_build_vector (stmts, &elts);
9086 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9087 vec_init, vec_mul);
9088 vec_init = gimple_convert (stmts, vectype, vec_init);
9090 break;
9092 default:
9093 gcc_unreachable ();
9096 return vec_init;
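/* Editorial example of the vect_step_op_mul case above: with X = 3, S = 2
   and four lanes, the element vector built is [ 1, 2, 4, 8 ], and
   multiplying by the splatted X gives vec_init = [ 3, 6, 12, 24 ],
   i.e. X * pow (S, i) in lane i.  */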
9099 /* Peel init_expr by skip_niters iterations for induction_type. */
9100 tree
9101 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9102 tree skip_niters, tree step_expr,
9103 enum vect_induction_op_type induction_type)
9105 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9106 tree type = TREE_TYPE (init_expr);
9107 unsigned prec = TYPE_PRECISION (type);
9108 switch (induction_type)
9110 case vect_step_op_neg:
9111 if (TREE_INT_CST_LOW (skip_niters) % 2)
9112 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9113 /* else no change. */
9114 break;
9116 case vect_step_op_shr:
9117 case vect_step_op_shl:
9118 skip_niters = gimple_convert (stmts, type, skip_niters);
9119 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9120 /* When the shift amount >= precision, we need to avoid undefined behavior.
9121 In the original loop there is no UB, and according to the semantics
9122 init_expr should become 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9123 if (!tree_fits_uhwi_p (step_expr)
9124 || tree_to_uhwi (step_expr) >= prec)
9126 if (induction_type == vect_step_op_shl
9127 || TYPE_UNSIGNED (type))
9128 init_expr = build_zero_cst (type);
9129 else
9130 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9131 init_expr,
9132 wide_int_to_tree (type, prec - 1));
9134 else
9135 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9136 ? RSHIFT_EXPR : LSHIFT_EXPR),
9137 type, init_expr, step_expr);
9138 break;
9140 case vect_step_op_mul:
9142 tree utype = unsigned_type_for (type);
9143 init_expr = gimple_convert (stmts, utype, init_expr);
9144 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9145 wide_int begin = wi::to_wide (step_expr);
9146 for (unsigned i = 0; i != skipn - 1; i++)
9147 begin = wi::mul (begin, wi::to_wide (step_expr));
9148 tree mult_expr = wide_int_to_tree (utype, begin);
9149 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9150 init_expr = gimple_convert (stmts, type, init_expr);
9152 break;
9154 default:
9155 gcc_unreachable ();
9158 return init_expr;
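/* Editorial example of the vect_step_op_mul peeling above: skipping
   skip_niters = 3 iterations of an iv with init X = 3 and step S = 2
   computes begin = 2 * 2 * 2 = 8 and returns 3 * 8 = 24, matching the
   scalar iv after three iterations (3 -> 6 -> 12 -> 24).  */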
9161 /* Create vector step for vectorized iv. */
9162 static tree
9163 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9164 poly_uint64 vf,
9165 enum vect_induction_op_type induction_type)
9167 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9168 tree new_name = NULL;
9169 /* Step should be pow (step, vf) for mult induction. */
9170 if (induction_type == vect_step_op_mul)
9172 gcc_assert (vf.is_constant ());
9173 wide_int begin = wi::to_wide (step_expr);
9175 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9176 begin = wi::mul (begin, wi::to_wide (step_expr));
9178 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9180 else if (induction_type == vect_step_op_neg)
9181 /* Do nothing. */
9183 else
9184 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9185 expr, step_expr);
9186 return new_name;
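/* For instance (editorial note), with vf = 4 the step returned above is
   pow (S, 4) for a mult iv (16 for S = 2) and 4 * S for a shift iv,
   since one vector iteration advances the scalar iv by vf steps.  */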
9189 static tree
9190 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9191 stmt_vec_info stmt_info,
9192 tree new_name, tree vectype,
9193 enum vect_induction_op_type induction_type)
9195 /* No step is needed for neg induction. */
9196 if (induction_type == vect_step_op_neg)
9197 return NULL;
9199 tree t = unshare_expr (new_name);
9200 gcc_assert (CONSTANT_CLASS_P (new_name)
9201 || TREE_CODE (new_name) == SSA_NAME);
9202 tree new_vec = build_vector_from_val (vectype, t);
9203 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9204 new_vec, vectype, NULL);
9205 return vec_step;
9208 /* Update the vectorized iv with vec_step; induc_def is the initial value. */
9209 static tree
9210 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9211 tree induc_def, tree vec_step,
9212 enum vect_induction_op_type induction_type)
9214 tree vec_def = induc_def;
9215 switch (induction_type)
9217 case vect_step_op_mul:
9219 /* Use an unsigned mult to avoid undefined behavior on signed integer overflow. */
9220 tree uvectype
9221 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9222 TYPE_VECTOR_SUBPARTS (vectype));
9223 vec_def = gimple_convert (stmts, uvectype, vec_def);
9224 vec_step = gimple_convert (stmts, uvectype, vec_step);
9225 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9226 vec_def, vec_step);
9227 vec_def = gimple_convert (stmts, vectype, vec_def);
9229 break;
9231 case vect_step_op_shr:
9232 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9233 vec_def, vec_step);
9234 break;
9236 case vect_step_op_shl:
9237 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9238 vec_def, vec_step);
9239 break;
9240 case vect_step_op_neg:
9241 vec_def = induc_def;
9242 /* Do nothing. */
9243 break;
9244 default:
9245 gcc_unreachable ();
9248 return vec_def;
9252 /* Function vectorizable_induction
9254 Check if STMT_INFO performs a nonlinear induction computation that can be
9255 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9256 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9257 basic block.
9258 Return true if STMT_INFO is vectorizable in this way. */
9260 static bool
9261 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9262 stmt_vec_info stmt_info,
9263 gimple **vec_stmt, slp_tree slp_node,
9264 stmt_vector_for_cost *cost_vec)
9266 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9267 unsigned ncopies;
9268 bool nested_in_vect_loop = false;
9269 class loop *iv_loop;
9270 tree vec_def;
9271 edge pe = loop_preheader_edge (loop);
9272 basic_block new_bb;
9273 tree vec_init, vec_step;
9274 tree new_name;
9275 gimple *new_stmt;
9276 gphi *induction_phi;
9277 tree induc_def, vec_dest;
9278 tree init_expr, step_expr;
9279 tree niters_skip;
9280 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9281 unsigned i;
9282 gimple_stmt_iterator si;
9284 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9286 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9287 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9288 enum vect_induction_op_type induction_type
9289 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9291 gcc_assert (induction_type > vect_step_op_add);
9293 if (slp_node)
9294 ncopies = 1;
9295 else
9296 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9297 gcc_assert (ncopies >= 1);
9299 /* FORNOW. Only handle nonlinear induction in the same loop. */
9300 if (nested_in_vect_loop_p (loop, stmt_info))
9302 if (dump_enabled_p ())
9303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9304 "nonlinear induction in nested loop.\n");
9305 return false;
9308 iv_loop = loop;
9309 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9311 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
9312 update for each iv and a permutation to generate the wanted vector iv. */
9313 if (slp_node)
9315 if (dump_enabled_p ())
9316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9317 "SLP induction not supported for nonlinear"
9318 " induction.\n");
9319 return false;
9322 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9324 if (dump_enabled_p ())
9325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9326 "floating point nonlinear induction vectorization"
9327 " not supported.\n");
9328 return false;
9331 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9332 init_expr = vect_phi_initial_value (phi);
9333 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9334 && TREE_CODE (step_expr) == INTEGER_CST);
9335 /* step_expr should be aligned with init_expr,
9336 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9337 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9339 if (TREE_CODE (init_expr) == INTEGER_CST)
9340 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9341 else
9342 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9343 TREE_TYPE (init_expr)));
9345 switch (induction_type)
9347 case vect_step_op_neg:
9348 if (TREE_CODE (init_expr) != INTEGER_CST
9349 && TREE_CODE (init_expr) != REAL_CST)
9351 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9352 if (!directly_supported_p (NEGATE_EXPR, vectype))
9353 return false;
9355 /* The encoding has 2 interleaved stepped patterns. */
9356 vec_perm_builder sel (nunits, 2, 3);
9357 machine_mode mode = TYPE_MODE (vectype);
9358 sel.quick_grow (6);
9359 for (i = 0; i < 3; i++)
9361 sel[i * 2] = i;
9362 sel[i * 2 + 1] = i + nunits;
9364 vec_perm_indices indices (sel, 2, nunits);
9365 if (!can_vec_perm_const_p (mode, mode, indices))
9366 return false;
9368 break;
9370 case vect_step_op_mul:
9372 /* Check for backend support of MULT_EXPR. */
9373 if (!directly_supported_p (MULT_EXPR, vectype))
9374 return false;
9376 /* ??? How to construct the vector step for a variable-length vector:
9377 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9378 if (!vf.is_constant ())
9379 return false;
9381 break;
9383 case vect_step_op_shr:
9384 /* Check for backend support of RSHIFT_EXPR. */
9385 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9386 return false;
9388 /* Don't shift more than the type precision, to avoid undefined behavior. */
9389 if (!tree_fits_uhwi_p (step_expr)
9390 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9391 TYPE_PRECISION (TREE_TYPE (init_expr))))
9392 return false;
9393 break;
9395 case vect_step_op_shl:
9396 /* Check for backend support of LSHIFT_EXPR. */
9397 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9398 return false;
9400 /* Don't shift more than the type precision, to avoid undefined behavior. */
9401 if (!tree_fits_uhwi_p (step_expr)
9402 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9403 TYPE_PRECISION (TREE_TYPE (init_expr))))
9404 return false;
9406 break;
9408 default:
9409 gcc_unreachable ();
9412 if (!vec_stmt) /* transformation not required. */
9414 unsigned inside_cost = 0, prologue_cost = 0;
9415 /* loop cost for vec_loop. */
9417 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9418 stmt_info, 0, vect_body);
9420 /* Neg induction doesn't have any inside_cost. */
9422 if (induction_type == vect_step_op_neg)
9423 inside_cost = 0;
9425 /* prologue cost for vec_init and vec_step. */
9426 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9427 stmt_info, 0, vect_prologue);
9429 if (dump_enabled_p ())
9430 dump_printf_loc (MSG_NOTE, vect_location,
9431 "vect_model_induction_cost: inside_cost = %d, "
9432 "prologue_cost = %d. \n", inside_cost,
9433 prologue_cost);
9435 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9436 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9437 return true;
9440 /* Transform. */
9442 /* Compute a vector variable, initialized with the first VF values of
9443 the induction variable. E.g., for an iv with IV_PHI='X' and
9444 evolution S, for a vector of 4 units, we want to compute:
9445 [X, X + S, X + 2*S, X + 3*S]. */
9447 if (dump_enabled_p ())
9448 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9450 pe = loop_preheader_edge (iv_loop);
9451 /* Find the first insertion point in the BB. */
9452 basic_block bb = gimple_bb (phi);
9453 si = gsi_after_labels (bb);
9455 gimple_seq stmts = NULL;
9457 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9458 /* If we are using the loop mask to "peel" for alignment then we need
9459 to adjust the start value here. */
9460 if (niters_skip != NULL_TREE)
9461 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9462 step_expr, induction_type);
9464 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9465 step_expr, nunits, vectype,
9466 induction_type);
9467 if (stmts)
9469 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9470 gcc_assert (!new_bb);
9473 stmts = NULL;
9474 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9475 vf, induction_type);
9476 if (stmts)
9478 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9479 gcc_assert (!new_bb);
9482 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9483 new_name, vectype,
9484 induction_type);
9485 /* Create the following def-use cycle:
9486 loop prolog:
9487 vec_init = ...
9488 vec_step = ...
9489 loop:
9490 vec_iv = PHI <vec_init, vec_loop>
9492 STMT
9494 vec_loop = vec_iv + vec_step; */
9496 /* Create the induction-phi that defines the induction-operand. */
9497 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9498 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9499 induc_def = PHI_RESULT (induction_phi);
9501 /* Create the iv update inside the loop. */
9502 stmts = NULL;
9503 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9504 induc_def, vec_step,
9505 induction_type);
9507 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9508 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9510 /* Set the arguments of the phi node: */
9511 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9512 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9513 UNKNOWN_LOCATION);
9515 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9516 *vec_stmt = induction_phi;
9518 /* In case the vectorization factor (VF) is bigger than the number
9519 of elements that we can fit in a vectype (nunits), we have to generate
9520 more than one vector stmt, i.e. we need to "unroll" the
9521 vector stmt by a factor VF/nunits. For more details see documentation
9522 in vectorizable_operation. */
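/* E.g. (editorial note) with VF = 8 and a vectype holding nunits = 4
   elements, ncopies = 2 and the iv update below is emitted once more,
   each copy advancing the previous one by the vector step.  */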
9524 if (ncopies > 1)
9526 stmts = NULL;
9527 /* FORNOW. This restriction should be relaxed. */
9528 gcc_assert (!nested_in_vect_loop);
9530 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9531 nunits, induction_type);
9533 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9534 new_name, vectype,
9535 induction_type);
9536 vec_def = induc_def;
9537 for (i = 1; i < ncopies; i++)
9539 /* vec_i = vec_prev + vec_step. */
9540 stmts = NULL;
9541 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9542 vec_def, vec_step,
9543 induction_type);
9544 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9545 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9546 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9550 if (dump_enabled_p ())
9551 dump_printf_loc (MSG_NOTE, vect_location,
9552 "transform induction: created def-use cycle: %G%G",
9553 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9555 return true;
9558 /* Function vectorizable_induction
9560 Check if STMT_INFO performs an induction computation that can be vectorized.
9561 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9562 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9563 Return true if STMT_INFO is vectorizable in this way. */
9565 bool
9566 vectorizable_induction (loop_vec_info loop_vinfo,
9567 stmt_vec_info stmt_info,
9568 gimple **vec_stmt, slp_tree slp_node,
9569 stmt_vector_for_cost *cost_vec)
9571 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9572 unsigned ncopies;
9573 bool nested_in_vect_loop = false;
9574 class loop *iv_loop;
9575 tree vec_def;
9576 edge pe = loop_preheader_edge (loop);
9577 basic_block new_bb;
9578 tree new_vec, vec_init, vec_step, t;
9579 tree new_name;
9580 gimple *new_stmt;
9581 gphi *induction_phi;
9582 tree induc_def, vec_dest;
9583 tree init_expr, step_expr;
9584 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9585 unsigned i;
9586 tree expr;
9587 gimple_stmt_iterator si;
9588 enum vect_induction_op_type induction_type
9589 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9591 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9592 if (!phi)
9593 return false;
9595 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9596 return false;
9598 /* Make sure it was recognized as an induction computation. */
9599 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9600 return false;
9602 /* Handle nonlinear induction in a separate place. */
9603 if (induction_type != vect_step_op_add)
9604 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9605 vec_stmt, slp_node, cost_vec);
9607 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9608 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9610 if (slp_node)
9611 ncopies = 1;
9612 else
9613 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9614 gcc_assert (ncopies >= 1);
9616 /* FORNOW. These restrictions should be relaxed. */
9617 if (nested_in_vect_loop_p (loop, stmt_info))
9619 imm_use_iterator imm_iter;
9620 use_operand_p use_p;
9621 gimple *exit_phi;
9622 edge latch_e;
9623 tree loop_arg;
9625 if (ncopies > 1)
9627 if (dump_enabled_p ())
9628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9629 "multiple types in nested loop.\n");
9630 return false;
9633 exit_phi = NULL;
9634 latch_e = loop_latch_edge (loop->inner);
9635 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9636 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9638 gimple *use_stmt = USE_STMT (use_p);
9639 if (is_gimple_debug (use_stmt))
9640 continue;
9642 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9644 exit_phi = use_stmt;
9645 break;
9648 if (exit_phi)
9650 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9651 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9652 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9654 if (dump_enabled_p ())
9655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9656 "inner-loop induction only used outside "
9657 "of the outer vectorized loop.\n");
9658 return false;
9662 nested_in_vect_loop = true;
9663 iv_loop = loop->inner;
9665 else
9666 iv_loop = loop;
9667 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9669 if (slp_node && !nunits.is_constant ())
9671 /* The current SLP code creates the step value element-by-element. */
9672 if (dump_enabled_p ())
9673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9674 "SLP induction not supported for variable-length"
9675 " vectors.\n");
9676 return false;
9679 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9681 if (dump_enabled_p ())
9682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9683 "floating point induction vectorization disabled\n");
9684 return false;
9687 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9688 gcc_assert (step_expr != NULL_TREE);
9689 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9691 /* Check for backend support of PLUS/MINUS_EXPR. */
9692 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9693 || !directly_supported_p (MINUS_EXPR, step_vectype))
9694 return false;
9696 if (!vec_stmt) /* transformation not required. */
9698 unsigned inside_cost = 0, prologue_cost = 0;
9699 if (slp_node)
9701 /* We eventually need to set a vector type on invariant
9702 arguments. */
9703 unsigned j;
9704 slp_tree child;
9705 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9706 if (!vect_maybe_update_slp_op_vectype
9707 (child, SLP_TREE_VECTYPE (slp_node)))
9709 if (dump_enabled_p ())
9710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9711 "incompatible vector types for "
9712 "invariants\n");
9713 return false;
9715 /* loop cost for vec_loop. */
9716 inside_cost
9717 = record_stmt_cost (cost_vec,
9718 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9719 vector_stmt, stmt_info, 0, vect_body);
9720 /* prologue cost for vec_init (if not nested) and step. */
9721 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9722 scalar_to_vec,
9723 stmt_info, 0, vect_prologue);
9725 else /* if (!slp_node) */
9727 /* loop cost for vec_loop. */
9728 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9729 stmt_info, 0, vect_body);
9730 /* prologue cost for vec_init and vec_step. */
9731 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9732 stmt_info, 0, vect_prologue);
9734 if (dump_enabled_p ())
9735 dump_printf_loc (MSG_NOTE, vect_location,
9736 "vect_model_induction_cost: inside_cost = %d, "
9737 "prologue_cost = %d .\n", inside_cost,
9738 prologue_cost);
9740 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9741 DUMP_VECT_SCOPE ("vectorizable_induction");
9742 return true;
9745 /* Transform. */
9747 /* Compute a vector variable, initialized with the first VF values of
9748 the induction variable. E.g., for an iv with IV_PHI='X' and
9749 evolution S, for a vector of 4 units, we want to compute:
9750 [X, X + S, X + 2*S, X + 3*S]. */
9752 if (dump_enabled_p ())
9753 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9755 pe = loop_preheader_edge (iv_loop);
9756 /* Find the first insertion point in the BB. */
9757 basic_block bb = gimple_bb (phi);
9758 si = gsi_after_labels (bb);
9760 /* For SLP induction we have to generate several IVs as for example
9761 with group size 3 we need
9762 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9763 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9764 if (slp_node)
9766 /* Enforced above. */
9767 unsigned int const_nunits = nunits.to_constant ();
9769 /* The initial values are vectorized, but any lanes > group_size
9770 need adjustment. */
9771 slp_tree init_node
9772 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9774 /* Gather steps. Since we do not vectorize inductions as
9775 cycles we have to reconstruct the step from SCEV data. */
9776 unsigned group_size = SLP_TREE_LANES (slp_node);
9777 tree *steps = XALLOCAVEC (tree, group_size);
9778 tree *inits = XALLOCAVEC (tree, group_size);
9779 stmt_vec_info phi_info;
9780 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9782 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9783 if (!init_node)
9784 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9785 pe->dest_idx);
9788 /* Now generate the IVs. */
9789 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9790 gcc_assert ((const_nunits * nvects) % group_size == 0);
9791 unsigned nivs;
9792 if (nested_in_vect_loop)
9793 nivs = nvects;
9794 else
9796 /* Compute the number of distinct IVs we need. First reduce
9797 group_size if it is a multiple of const_nunits so we get
9798 one IV for a group_size of 4 but const_nunits 2. */
9799 unsigned group_sizep = group_size;
9800 if (group_sizep % const_nunits == 0)
9801 group_sizep = group_sizep / const_nunits;
9802 nivs = least_common_multiple (group_sizep,
9803 const_nunits) / const_nunits;
9805 tree stept = TREE_TYPE (step_vectype);
9806 tree lupdate_mul = NULL_TREE;
9807 if (!nested_in_vect_loop)
9809 /* The number of iterations covered in one vector iteration. */
9810 unsigned lup_mul = (nvects * const_nunits) / group_size;
9811 lupdate_mul
9812 = build_vector_from_val (step_vectype,
9813 SCALAR_FLOAT_TYPE_P (stept)
9814 ? build_real_from_wide (stept, lup_mul,
9815 UNSIGNED)
9816 : build_int_cstu (stept, lup_mul));
9818 tree peel_mul = NULL_TREE;
9819 gimple_seq init_stmts = NULL;
9820 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9822 if (SCALAR_FLOAT_TYPE_P (stept))
9823 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9824 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9825 else
9826 peel_mul = gimple_convert (&init_stmts, stept,
9827 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9828 peel_mul = gimple_build_vector_from_val (&init_stmts,
9829 step_vectype, peel_mul);
9831 unsigned ivn;
9832 auto_vec<tree> vec_steps;
9833 for (ivn = 0; ivn < nivs; ++ivn)
9835 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9836 tree_vector_builder init_elts (vectype, const_nunits, 1);
9837 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9838 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9840 /* The scalar steps of the IVs. */
9841 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9842 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9843 step_elts.quick_push (elt);
9844 if (!init_node)
9846 /* The scalar inits of the IVs if not vectorized. */
9847 elt = inits[(ivn*const_nunits + eltn) % group_size];
9848 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9849 TREE_TYPE (elt)))
9850 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9851 TREE_TYPE (vectype), elt);
9852 init_elts.quick_push (elt);
9854 /* The number of steps to add to the initial values. */
9855 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9856 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9857 ? build_real_from_wide (stept,
9858 mul_elt, UNSIGNED)
9859 : build_int_cstu (stept, mul_elt));
9861 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9862 vec_steps.safe_push (vec_step);
9863 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9864 if (peel_mul)
9865 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9866 step_mul, peel_mul);
9867 if (!init_node)
9868 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9870 /* Create the induction-phi that defines the induction-operand. */
9871 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9872 "vec_iv_");
9873 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9874 induc_def = PHI_RESULT (induction_phi);
9876 /* Create the iv update inside the loop */
9877 tree up = vec_step;
9878 if (lupdate_mul)
9879 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9880 vec_step, lupdate_mul);
9881 gimple_seq stmts = NULL;
9882 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9883 vec_def = gimple_build (&stmts,
9884 PLUS_EXPR, step_vectype, vec_def, up);
9885 vec_def = gimple_convert (&stmts, vectype, vec_def);
9886 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9887 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9888 UNKNOWN_LOCATION);
9890 if (init_node)
9891 vec_init = vect_get_slp_vect_def (init_node, ivn);
9892 if (!nested_in_vect_loop
9893 && !integer_zerop (step_mul))
9895 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9896 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9897 vec_step, step_mul);
9898 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9899 vec_def, up);
9900 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9903 /* Set the arguments of the phi node: */
9904 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9906 slp_node->push_vec_def (induction_phi);
9908 if (!nested_in_vect_loop)
9910 /* Fill up to the number of vectors we need for the whole group. */
9911 nivs = least_common_multiple (group_size,
9912 const_nunits) / const_nunits;
9913 vec_steps.reserve (nivs-ivn);
9914 for (; ivn < nivs; ++ivn)
9916 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9917 vec_steps.quick_push (vec_steps[0]);
9921 /* Re-use IVs when we can. We are generating further vector
9922 stmts by adding VF' * stride to the IVs generated above. */
9923 if (ivn < nvects)
9925 unsigned vfp
9926 = least_common_multiple (group_size, const_nunits) / group_size;
9927 tree lupdate_mul
9928 = build_vector_from_val (step_vectype,
9929 SCALAR_FLOAT_TYPE_P (stept)
9930 ? build_real_from_wide (stept,
9931 vfp, UNSIGNED)
9932 : build_int_cstu (stept, vfp));
9933 for (; ivn < nvects; ++ivn)
9935 gimple *iv
9936 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9937 tree def = gimple_get_lhs (iv);
9938 if (ivn < 2*nivs)
9939 vec_steps[ivn - nivs]
9940 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9941 vec_steps[ivn - nivs], lupdate_mul);
9942 gimple_seq stmts = NULL;
9943 def = gimple_convert (&stmts, step_vectype, def);
9944 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9945 def, vec_steps[ivn % nivs]);
9946 def = gimple_convert (&stmts, vectype, def);
9947 if (gimple_code (iv) == GIMPLE_PHI)
9948 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9949 else
9951 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9952 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9954 slp_node->push_vec_def (def);
9958 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9959 gcc_assert (!new_bb);
9961 return true;
9964 init_expr = vect_phi_initial_value (phi);
9966 gimple_seq stmts = NULL;
9967 if (!nested_in_vect_loop)
9969 /* Convert the initial value to the IV update type. */
9970 tree new_type = TREE_TYPE (step_expr);
9971 init_expr = gimple_convert (&stmts, new_type, init_expr);
9973 /* If we are using the loop mask to "peel" for alignment then we need
9974 to adjust the start value here. */
9975 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9976 if (skip_niters != NULL_TREE)
9978 if (FLOAT_TYPE_P (vectype))
9979 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9980 skip_niters);
9981 else
9982 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9983 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9984 skip_niters, step_expr);
9985 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9986 init_expr, skip_step);
9990 if (stmts)
9992 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9993 gcc_assert (!new_bb);
9996 /* Create the vector that holds the initial_value of the induction. */
9997 if (nested_in_vect_loop)
9999 /* iv_loop is nested in the loop to be vectorized. init_expr has already
10000 been created during vectorization of previous stmts. We obtain it
10001 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10002 auto_vec<tree> vec_inits;
10003 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10004 init_expr, &vec_inits);
10005 vec_init = vec_inits[0];
10006 /* If the initial value is not of proper type, convert it. */
10007 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10009 new_stmt
10010 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10011 vect_simple_var,
10012 "vec_iv_"),
10013 VIEW_CONVERT_EXPR,
10014 build1 (VIEW_CONVERT_EXPR, vectype,
10015 vec_init));
10016 vec_init = gimple_assign_lhs (new_stmt);
10017 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10018 new_stmt);
10019 gcc_assert (!new_bb);
10022 else
10024 /* iv_loop is the loop to be vectorized. Create:
10025 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10026 stmts = NULL;
10027 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10029 unsigned HOST_WIDE_INT const_nunits;
10030 if (nunits.is_constant (&const_nunits))
10032 tree_vector_builder elts (step_vectype, const_nunits, 1);
10033 elts.quick_push (new_name);
10034 for (i = 1; i < const_nunits; i++)
10036 /* Create: new_name_i = new_name + step_expr */
10037 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10038 new_name, step_expr);
10039 elts.quick_push (new_name);
10041 /* Create a vector from [new_name_0, new_name_1, ...,
10042 new_name_nunits-1] */
10043 vec_init = gimple_build_vector (&stmts, &elts);
10045 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10046 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10047 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10048 new_name, step_expr);
10049 else
10051 /* Build:
10052 [base, base, base, ...]
10053 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10054 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10055 gcc_assert (flag_associative_math);
10056 tree index = build_index_vector (step_vectype, 0, 1);
10057 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10058 new_name);
10059 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10060 step_expr);
10061 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10062 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10063 vec_init, step_vec);
10064 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10065 vec_init, base_vec);
10067 vec_init = gimple_convert (&stmts, vectype, vec_init);
10069 if (stmts)
10071 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10072 gcc_assert (!new_bb);
10077 /* Create the vector that holds the step of the induction. */
10078 if (nested_in_vect_loop)
10079 /* iv_loop is nested in the loop to be vectorized. Generate:
10080 vec_step = [S, S, S, S] */
10081 new_name = step_expr;
10082 else
10084 /* iv_loop is the loop to be vectorized. Generate:
10085 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10086 gimple_seq seq = NULL;
10087 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10089 expr = build_int_cst (integer_type_node, vf);
10090 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10092 else
10093 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10094 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10095 expr, step_expr);
10096 if (seq)
10098 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10099 gcc_assert (!new_bb);
10103 t = unshare_expr (new_name);
10104 gcc_assert (CONSTANT_CLASS_P (new_name)
10105 || TREE_CODE (new_name) == SSA_NAME);
10106 new_vec = build_vector_from_val (step_vectype, t);
10107 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10108 new_vec, step_vectype, NULL);
10111 /* Create the following def-use cycle:
10112 loop prolog:
10113 vec_init = ...
10114 vec_step = ...
10115 loop:
10116 vec_iv = PHI <vec_init, vec_loop>
10118 STMT
10120 vec_loop = vec_iv + vec_step; */
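/* As a rough illustration only (not the exact IL that is emitted): for a
   scalar induction  i = init; ...; i += step  and an assumed VF of 4 with
   a single vector copy, the prolog values above are

     vec_init = { init, init+step, init+2*step, init+3*step }
     vec_step = { 4*step, 4*step, 4*step, 4*step }

   so adding vec_step once per vector iteration advances every lane by a
   full vector's worth of scalar iterations.  */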
10122 /* Create the induction-phi that defines the induction-operand. */
10123 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10124 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10125 induc_def = PHI_RESULT (induction_phi);
10127 /* Create the iv update inside the loop */
10128 stmts = NULL;
10129 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10130 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10131 vec_def = gimple_convert (&stmts, vectype, vec_def);
10132 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10133 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10135 /* Set the arguments of the phi node: */
10136 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10137 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10138 UNKNOWN_LOCATION);
10140 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10141 *vec_stmt = induction_phi;
10143 /* In case the vectorization factor (VF) is bigger than the number
10144 of elements that we can fit in a vectype (nunits), we have to generate
10145 more than one vector stmt - i.e. we need to "unroll" the
10146 vector stmt by a factor of VF/nunits. For more details see the
10147 documentation in vectorizable_operation. */
10149 if (ncopies > 1)
10151 gimple_seq seq = NULL;
10152 /* FORNOW. This restriction should be relaxed. */
10153 gcc_assert (!nested_in_vect_loop);
10155 /* Create the vector that holds the step of the induction. */
10156 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10158 expr = build_int_cst (integer_type_node, nunits);
10159 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10161 else
10162 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10163 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10164 expr, step_expr);
10165 if (seq)
10167 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10168 gcc_assert (!new_bb);
10171 t = unshare_expr (new_name);
10172 gcc_assert (CONSTANT_CLASS_P (new_name)
10173 || TREE_CODE (new_name) == SSA_NAME);
10174 new_vec = build_vector_from_val (step_vectype, t);
10175 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10176 new_vec, step_vectype, NULL);
10178 vec_def = induc_def;
10179 for (i = 1; i < ncopies + 1; i++)
10181 /* vec_i = vec_prev + vec_step */
10182 gimple_seq stmts = NULL;
10183 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10184 vec_def = gimple_build (&stmts,
10185 PLUS_EXPR, step_vectype, vec_def, vec_step);
10186 vec_def = gimple_convert (&stmts, vectype, vec_def);
10188 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10189 if (i < ncopies)
10191 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10192 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10194 else
10196 /* vec_1 = vec_iv + (VF/n * S)
10197 vec_2 = vec_1 + (VF/n * S)
10199 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10201 vec_n is used as vec_loop to save the large step register and
10202 related operations. */
10203 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10204 UNKNOWN_LOCATION);
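/* A concrete, purely illustrative instance: with nunits == 4 and
   ncopies == 2 (an assumed VF of 8), vec_step above is
   { 4*S, 4*S, 4*S, 4*S } and the loop over i emits

     vec_1 = vec_iv + vec_step;   // second vector copy of the IV
     vec_2 = vec_1 + vec_step;    // == vec_iv + 8*S, fed to the PHI latch

   so the wider step vector { 8*S, ... } never has to be materialized.  */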
10209 if (dump_enabled_p ())
10210 dump_printf_loc (MSG_NOTE, vect_location,
10211 "transform induction: created def-use cycle: %G%G",
10212 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10214 return true;
10217 /* Function vectorizable_live_operation.
10219 STMT_INFO computes a value that is used outside the loop. Check if
10220 it can be supported. */
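/* For illustration, a typical live operation is T in the assumed example
   below; the value computed by the last scalar iteration is needed after
   the loop:

     int t = 0;
     for (int i = 0; i < n; i++)
       {
         t = x[i] * y[i];
         out[i] = t;
       }
     use (t);

   In the simplest case the vectorizer extracts T from the last lane of
   the last vector copy of the vectorized statement after the loop.  */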
10222 bool
10223 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10224 slp_tree slp_node, slp_instance slp_node_instance,
10225 int slp_index, bool vec_stmt_p,
10226 stmt_vector_for_cost *cost_vec)
10228 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10229 imm_use_iterator imm_iter;
10230 tree lhs, lhs_type, bitsize;
10231 tree vectype = (slp_node
10232 ? SLP_TREE_VECTYPE (slp_node)
10233 : STMT_VINFO_VECTYPE (stmt_info));
10234 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10235 int ncopies;
10236 gimple *use_stmt;
10237 auto_vec<tree> vec_oprnds;
10238 int vec_entry = 0;
10239 poly_uint64 vec_index = 0;
10241 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10243 /* If a stmt of a reduction is live, vectorize it via
10244 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10245 validity so just trigger the transform here. */
10246 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10248 if (!vec_stmt_p)
10249 return true;
10250 if (slp_node)
10252 /* For reduction chains the meta-info is attached to
10253 the group leader. */
10254 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10255 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10256 /* For SLP reductions we vectorize the epilogue for
10257 all involved stmts together. */
10258 else if (slp_index != 0)
10259 return true;
10261 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10262 gcc_assert (reduc_info->is_reduc_info);
10263 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10264 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10265 return true;
10266 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10267 slp_node_instance);
10268 return true;
10271 /* If STMT is not relevant and it is a simple assignment and its inputs are
10272 invariant then it can remain in place, unvectorized. The original last
10273 scalar value that it computes will be used. */
10274 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10276 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10277 if (dump_enabled_p ())
10278 dump_printf_loc (MSG_NOTE, vect_location,
10279 "statement is simple and uses invariant. Leaving in "
10280 "place.\n");
10281 return true;
10284 if (slp_node)
10285 ncopies = 1;
10286 else
10287 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10289 if (slp_node)
10291 gcc_assert (slp_index >= 0);
10293 /* Get the last occurrence of the scalar index from the concatenation of
10294 all the slp vectors. Calculate which slp vector it is and the index
10295 within. */
10296 int num_scalar = SLP_TREE_LANES (slp_node);
10297 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10298 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
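/* E.g. (illustrative numbers): with num_scalar == 3, num_vec == 2 and
   nunits == 4 the group occupies the last 3 of the 8 concatenated lanes,
   so slp_index == 1 gives pos == 2*4 - 3 + 1 == 6, i.e. vec_entry == 1
   and vec_index == 2 below.  */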
10300 /* Calculate which vector contains the result, and which lane of
10301 that vector we need. */
10302 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10304 if (dump_enabled_p ())
10305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10306 "Cannot determine which vector holds the"
10307 " final result.\n");
10308 return false;
10312 if (!vec_stmt_p)
10314 /* No transformation required. */
10315 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10317 if (slp_node)
10319 if (dump_enabled_p ())
10320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10321 "can't operate on partial vectors "
10322 "because an SLP statement is live after "
10323 "the loop.\n");
10324 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10326 else if (ncopies > 1)
10328 if (dump_enabled_p ())
10329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10330 "can't operate on partial vectors "
10331 "because ncopies is greater than 1.\n");
10332 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10334 else
10336 gcc_assert (ncopies == 1 && !slp_node);
10337 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10338 OPTIMIZE_FOR_SPEED))
10339 vect_record_loop_mask (loop_vinfo,
10340 &LOOP_VINFO_MASKS (loop_vinfo),
10341 1, vectype, NULL);
10342 else if (can_vec_extract_var_idx_p (
10343 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10344 vect_record_loop_len (loop_vinfo,
10345 &LOOP_VINFO_LENS (loop_vinfo),
10346 1, vectype, 1);
10347 else
10349 if (dump_enabled_p ())
10350 dump_printf_loc (
10351 MSG_MISSED_OPTIMIZATION, vect_location,
10352 "can't operate on partial vectors "
10353 "because the target doesn't support extract "
10354 "last reduction.\n");
10355 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10359 /* ??? Enable for loop costing as well. */
10360 if (!loop_vinfo)
10361 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10362 0, vect_epilogue);
10363 return true;
10366 /* Use the lhs of the original scalar statement. */
10367 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10368 if (dump_enabled_p ())
10369 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10370 "stmt %G", stmt);
10372 lhs = gimple_get_lhs (stmt);
10373 lhs_type = TREE_TYPE (lhs);
10375 bitsize = vector_element_bits_tree (vectype);
10377 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10378 tree vec_lhs, bitstart;
10379 gimple *vec_stmt;
10380 if (slp_node)
10382 gcc_assert (!loop_vinfo
10383 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10384 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10386 /* Get the correct slp vectorized stmt. */
10387 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10388 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10390 /* Get entry to use. */
10391 bitstart = bitsize_int (vec_index);
10392 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10394 else
10396 /* For multiple copies, get the last copy. */
10397 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10398 vec_lhs = gimple_get_lhs (vec_stmt);
10400 /* Get the last lane in the vector. */
10401 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10404 if (loop_vinfo)
10406 /* To ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10407 PHI requirement, insert one PHI node for it. It looks like:
10408 loop;
10410 # lhs' = PHI <lhs>
10412 loop;
10414 # vec_lhs' = PHI <vec_lhs>
10415 new_tree = lane_extract <vec_lhs', ...>;
10416 lhs' = new_tree; */
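/* A sketch of the simplest (neither masked nor length-controlled) case,
   not the exact IL built below: the scalar result is recovered as

     new_tree = BIT_FIELD_REF <vec_lhs_phi, bitsize, bitstart>;

   i.e. the requested lane of the vectorized def, inserted in the exit
   block so that loop-closed SSA form is preserved.  */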
10418 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10419 basic_block exit_bb = single_exit (loop)->dest;
10420 gcc_assert (single_pred_p (exit_bb));
10422 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10423 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10424 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10426 gimple_seq stmts = NULL;
10427 tree new_tree;
10428 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10430 /* Emit:
10432 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10434 where VEC_LHS is the vectorized live-out result and LEN is
10435 the loop length for the final iteration. */
10436 gcc_assert (ncopies == 1 && !slp_node);
10437 gimple_seq tem = NULL;
10438 gimple_stmt_iterator gsi = gsi_last (tem);
10439 tree len
10440 = vect_get_loop_len (loop_vinfo, &gsi,
10441 &LOOP_VINFO_LENS (loop_vinfo),
10442 1, vectype, 0, 0);
10444 /* BIAS - 1. */
10445 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10446 tree bias_minus_one
10447 = int_const_binop (MINUS_EXPR,
10448 build_int_cst (TREE_TYPE (len), biasval),
10449 build_one_cst (TREE_TYPE (len)));
10451 /* LAST_INDEX = LEN + (BIAS - 1). */
10452 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10453 len, bias_minus_one);
10455 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10456 tree scalar_res
10457 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10458 vec_lhs_phi, last_index);
10460 /* Convert the extracted vector element to the scalar type. */
10461 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10463 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10465 /* Emit:
10467 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10469 where VEC_LHS is the vectorized live-out result and MASK is
10470 the loop mask for the final iteration. */
10471 gcc_assert (ncopies == 1 && !slp_node);
10472 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10473 gimple_seq tem = NULL;
10474 gimple_stmt_iterator gsi = gsi_last (tem);
10475 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10476 &LOOP_VINFO_MASKS (loop_vinfo),
10477 1, vectype, 0);
10478 gimple_seq_add_seq (&stmts, tem);
10479 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10480 mask, vec_lhs_phi);
10482 /* Convert the extracted vector element to the scalar type. */
10483 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10485 else
10487 tree bftype = TREE_TYPE (vectype);
10488 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10489 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10490 new_tree = build3 (BIT_FIELD_REF, bftype,
10491 vec_lhs_phi, bitsize, bitstart);
10492 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10493 &stmts, true, NULL_TREE);
10496 if (stmts)
10498 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10499 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10501 /* Remove existing phi from lhs and create one copy from new_tree. */
10502 tree lhs_phi = NULL_TREE;
10503 gimple_stmt_iterator gsi;
10504 for (gsi = gsi_start_phis (exit_bb);
10505 !gsi_end_p (gsi); gsi_next (&gsi))
10507 gimple *phi = gsi_stmt (gsi);
10508 if ((gimple_phi_arg_def (phi, 0) == lhs))
10510 remove_phi_node (&gsi, false);
10511 lhs_phi = gimple_phi_result (phi);
10512 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10513 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10514 break;
10519 /* Replace the use of lhs with the newly computed result. If the use stmt is
10520 a single-arg PHI, just replace all uses of the PHI result. This is necessary
10521 because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
10522 use_operand_p use_p;
10523 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10524 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10525 && !is_gimple_debug (use_stmt))
10527 if (gimple_code (use_stmt) == GIMPLE_PHI
10528 && gimple_phi_num_args (use_stmt) == 1)
10530 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10532 else
10534 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10535 SET_USE (use_p, new_tree);
10537 update_stmt (use_stmt);
10540 else
10542 /* For basic-block vectorization simply insert the lane-extraction. */
10543 tree bftype = TREE_TYPE (vectype);
10544 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10545 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10546 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10547 vec_lhs, bitsize, bitstart);
10548 gimple_seq stmts = NULL;
10549 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10550 &stmts, true, NULL_TREE);
10551 if (TREE_CODE (new_tree) == SSA_NAME
10552 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10553 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10554 if (is_a <gphi *> (vec_stmt))
10556 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10557 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10559 else
10561 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10562 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10565 /* Replace the use of lhs with the newly computed result. If the use stmt is
10566 a single-arg PHI, just replace all uses of the PHI result. This is necessary
10567 because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
10568 use_operand_p use_p;
10569 stmt_vec_info use_stmt_info;
10570 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10571 if (!is_gimple_debug (use_stmt)
10572 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10573 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10575 /* ??? This can happen when the live lane ends up being
10576 used in a vector construction code-generated by an
10577 external SLP node (and code-generation for that already
10578 happened). See gcc.dg/vect/bb-slp-47.c.
10579 Doing this is what would happen if that vector CTOR
10580 were not code-generated yet so it is not too bad.
10581 ??? In fact we'd likely want to avoid this situation
10582 in the first place. */
10583 if (TREE_CODE (new_tree) == SSA_NAME
10584 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10585 && gimple_code (use_stmt) != GIMPLE_PHI
10586 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10587 use_stmt))
10589 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10590 gcc_checking_assert (code == SSA_NAME
10591 || code == CONSTRUCTOR
10592 || code == VIEW_CONVERT_EXPR
10593 || CONVERT_EXPR_CODE_P (code));
10594 if (dump_enabled_p ())
10595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10596 "Using original scalar computation for "
10597 "live lane because use precedes vector "
10598 "def\n");
10599 continue;
10601 /* ??? It can also happen that we end up pulling a def into
10602 a loop where replacing out-of-loop uses would require
10603 a new LC SSA PHI node. Retain the original scalar in
10604 those cases as well. PR98064. */
10605 if (TREE_CODE (new_tree) == SSA_NAME
10606 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10607 && (gimple_bb (use_stmt)->loop_father
10608 != gimple_bb (vec_stmt)->loop_father)
10609 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10610 gimple_bb (use_stmt)->loop_father))
10612 if (dump_enabled_p ())
10613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10614 "Using original scalar computation for "
10615 "live lane because there is an out-of-loop "
10616 "definition for it\n");
10617 continue;
10619 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10620 SET_USE (use_p, new_tree);
10621 update_stmt (use_stmt);
10625 return true;
10628 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10630 static void
10631 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10633 ssa_op_iter op_iter;
10634 imm_use_iterator imm_iter;
10635 def_operand_p def_p;
10636 gimple *ustmt;
10638 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10640 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10642 basic_block bb;
10644 if (!is_gimple_debug (ustmt))
10645 continue;
10647 bb = gimple_bb (ustmt);
10649 if (!flow_bb_inside_loop_p (loop, bb))
10651 if (gimple_debug_bind_p (ustmt))
10653 if (dump_enabled_p ())
10654 dump_printf_loc (MSG_NOTE, vect_location,
10655 "killing debug use\n");
10657 gimple_debug_bind_reset_value (ustmt);
10658 update_stmt (ustmt);
10660 else
10661 gcc_unreachable ();
10667 /* Given the loop represented by LOOP_VINFO, return true if computation of
10668 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10669 otherwise. */
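/* For example (illustrative only): if the niters type is a 32-bit
   unsigned type and the latch executes 0xffffffff times, so that
   LOOP_VINFO_NITERSM1 == 0xffffffff, then NITERSM1 + 1 wraps around to 0
   and this function would return false.  */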
10671 static bool
10672 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10674 /* Constant case. */
10675 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10677 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10678 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10680 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10681 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10682 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10683 return true;
10686 widest_int max;
10687 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10688 /* Check the upper bound of loop niters. */
10689 if (get_max_loop_iterations (loop, &max))
10691 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10692 signop sgn = TYPE_SIGN (type);
10693 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10694 if (max < type_max)
10695 return true;
10697 return false;
10700 /* Return a mask type with half the number of elements as OLD_TYPE,
10701 given that it should have mode NEW_MODE. */
10703 tree
10704 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10706 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10707 return build_truth_vector_type_for_mode (nunits, new_mode);
10710 /* Return a mask type with twice as many elements as OLD_TYPE,
10711 given that it should have mode NEW_MODE. */
10713 tree
10714 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10716 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10717 return build_truth_vector_type_for_mode (nunits, new_mode);
10720 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10721 contain a sequence of NVECTORS masks that each control a vector of type
10722 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10723 these vector masks with the vector version of SCALAR_MASK. */
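/* A minimal sketch of what a fully-masked loop looks like, assuming a
   single rgroup and VF == 4 (illustration only, not the generated IL):

     for (i = 0; i < n; i += 4)
       {
         mask = { i+0 < n, i+1 < n, i+2 < n, i+3 < n };
         val = .MASK_LOAD (&src[i], mask);
         .MASK_STORE (&dst[i], mask, val);
       }

   so the final, partial iteration needs no scalar epilogue.  */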
10725 void
10726 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10727 unsigned int nvectors, tree vectype, tree scalar_mask)
10729 gcc_assert (nvectors != 0);
10731 if (scalar_mask)
10733 scalar_cond_masked_key cond (scalar_mask, nvectors);
10734 loop_vinfo->scalar_cond_masked_set.add (cond);
10737 masks->mask_set.add (std::make_pair (vectype, nvectors));
10740 /* Given a complete set of masks MASKS, extract mask number INDEX
10741 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10742 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10744 See the comment above vec_loop_masks for more details about the mask
10745 arrangement. */
10747 tree
10748 vect_get_loop_mask (loop_vec_info loop_vinfo,
10749 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10750 unsigned int nvectors, tree vectype, unsigned int index)
10752 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10753 == vect_partial_vectors_while_ult)
10755 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10756 tree mask_type = rgm->type;
10758 /* Populate the rgroup's mask array, if this is the first time we've
10759 used it. */
10760 if (rgm->controls.is_empty ())
10762 rgm->controls.safe_grow_cleared (nvectors, true);
10763 for (unsigned int i = 0; i < nvectors; ++i)
10765 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10766 /* Provide a dummy definition until the real one is available. */
10767 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10768 rgm->controls[i] = mask;
10772 tree mask = rgm->controls[index];
10773 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10774 TYPE_VECTOR_SUBPARTS (vectype)))
10776 /* A loop mask for data type X can be reused for data type Y
10777 if X has N times more elements than Y and if Y's elements
10778 are N times bigger than X's. In this case each sequence
10779 of N elements in the loop mask will be all-zero or all-one.
10780 We can then view-convert the mask so that each sequence of
10781 N elements is replaced by a single element. */
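/* For example (illustration only): a mask computed for 8 x short can be
   reused for 4 x int; each adjacent pair of mask elements is known to be
   all-zero or all-one, so the VIEW_CONVERT below just reinterprets the
   same bits as a mask with half as many, twice as wide elements.  */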
10782 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10783 TYPE_VECTOR_SUBPARTS (vectype)));
10784 gimple_seq seq = NULL;
10785 mask_type = truth_type_for (vectype);
10786 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10787 if (seq)
10788 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10790 return mask;
10792 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10793 == vect_partial_vectors_avx512)
10795 /* The number of scalars per iteration and the number of vectors are
10796 both compile-time constants. */
10797 unsigned int nscalars_per_iter
10798 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10799 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10801 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10803 /* The stored nV is dependent on the mask type produced. */
10804 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10805 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10806 == rgm->factor);
10807 nvectors = rgm->factor;
10809 /* Populate the rgroup's mask array, if this is the first time we've
10810 used it. */
10811 if (rgm->controls.is_empty ())
10813 rgm->controls.safe_grow_cleared (nvectors, true);
10814 for (unsigned int i = 0; i < nvectors; ++i)
10816 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10817 /* Provide a dummy definition until the real one is available. */
10818 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10819 rgm->controls[i] = mask;
10822 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10823 TYPE_VECTOR_SUBPARTS (vectype)))
10824 return rgm->controls[index];
10826 /* Split the vector if needed. Since we are dealing with integer-mode
10827 masks with AVX512 we can operate on the integer representation,
10828 performing whole-vector shifts. */
10829 unsigned HOST_WIDE_INT factor;
10830 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10831 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10832 gcc_assert (ok);
10833 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10834 tree mask_type = truth_type_for (vectype);
10835 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10836 unsigned vi = index / factor;
10837 unsigned vpart = index % factor;
10838 tree vec = rgm->controls[vi];
10839 gimple_seq seq = NULL;
10840 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10841 lang_hooks.types.type_for_mode
10842 (TYPE_MODE (rgm->type), 1), vec);
10843 /* For integer mode masks simply shift the right bits into position. */
10844 if (vpart != 0)
10845 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10846 build_int_cst (integer_type_node,
10847 (TYPE_VECTOR_SUBPARTS (vectype)
10848 * vpart)));
10849 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10850 (TYPE_MODE (mask_type), 1), vec);
10851 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10852 if (seq)
10853 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10854 return vec;
10856 else
10857 gcc_unreachable ();
10860 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10861 lengths for controlling an operation on VECTYPE. The operation splits
10862 each element of VECTYPE into FACTOR separate subelements, measuring the
10863 length as a number of these subelements. */
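/* For instance (illustrative): recording a length for V4SI with
   FACTOR == 4 means each 32-bit element is split into four byte-sized
   subelements and the length counts those, so a full vector corresponds
   to a length of 16 rather than 4.  */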
10865 void
10866 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10867 unsigned int nvectors, tree vectype, unsigned int factor)
10869 gcc_assert (nvectors != 0);
10870 if (lens->length () < nvectors)
10871 lens->safe_grow_cleared (nvectors, true);
10872 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10874 /* The number of scalars per iteration, the number of scalar-occupied
10875 bytes and the number of vectors are all compile-time constants. */
10876 unsigned int nscalars_per_iter
10877 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10878 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10880 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10882 /* For now, we only support cases in which all loads and stores fall back
10883 to VnQI or none do. */
10884 gcc_assert (!rgl->max_nscalars_per_iter
10885 || (rgl->factor == 1 && factor == 1)
10886 || (rgl->max_nscalars_per_iter * rgl->factor
10887 == nscalars_per_iter * factor));
10888 rgl->max_nscalars_per_iter = nscalars_per_iter;
10889 rgl->type = vectype;
10890 rgl->factor = factor;
10894 /* Given a complete set of lengths LENS, extract length number INDEX
10895 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10896 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10897 multiplied by the number of elements that should be processed.
10898 Insert any set-up statements before GSI. */
10900 tree
10901 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10902 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10903 unsigned int index, unsigned int factor)
10905 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10906 bool use_bias_adjusted_len =
10907 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10909 /* Populate the rgroup's len array, if this is the first time we've
10910 used it. */
10911 if (rgl->controls.is_empty ())
10913 rgl->controls.safe_grow_cleared (nvectors, true);
10914 for (unsigned int i = 0; i < nvectors; ++i)
10916 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10917 gcc_assert (len_type != NULL_TREE);
10919 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10921 /* Provide a dummy definition until the real one is available. */
10922 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10923 rgl->controls[i] = len;
10925 if (use_bias_adjusted_len)
10927 gcc_assert (i == 0);
10928 tree adjusted_len =
10929 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10930 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10931 rgl->bias_adjusted_ctrl = adjusted_len;
10936 if (use_bias_adjusted_len)
10937 return rgl->bias_adjusted_ctrl;
10939 tree loop_len = rgl->controls[index];
10940 if (rgl->factor == 1 && factor == 1)
10942 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10943 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10944 if (maybe_ne (nunits1, nunits2))
10946 /* A loop len for data type X can be reused for data type Y
10947 if X has N times more elements than Y and if Y's elements
10948 are N times bigger than X's. */
10949 gcc_assert (multiple_p (nunits1, nunits2));
10950 factor = exact_div (nunits1, nunits2).to_constant ();
10951 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10952 gimple_seq seq = NULL;
10953 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10954 build_int_cst (iv_type, factor));
10955 if (seq)
10956 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10959 return loop_len;
10962 /* Scale the profiling counters for LOOP, which is vectorized by
10963 factor VF.
10964 If FLAT is true, the loop we started with had an unrealistically flat
10965 profile. */
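/* As a rough, illustrative example: if the loop body was executed about
   1000 times per entry and the loop is vectorized with VF == 4, after
   scaling the body should be executed about 250 times per entry, with
   the exit edge correspondingly about four times more likely on each
   iteration.  */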
10967 static void
10968 scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
10970 /* For flat profiles do not scale down proportionally by VF and only
10971 cap by known iteration count bounds. */
10972 if (flat)
10974 if (dump_file && (dump_flags & TDF_DETAILS))
10975 fprintf (dump_file,
10976 "Vectorized loop profile seems flat; not scaling iteration "
10977 "count down by the vectorization factor %i\n", vf);
10978 scale_loop_profile (loop, profile_probability::always (),
10979 get_likely_max_loop_iterations_int (loop));
10980 return;
10982 /* The loop body executes VF times fewer iterations and the exit edge is taken VF times more often. */
10983 edge exit_e = single_exit (loop);
10984 profile_count entry_count = loop_preheader_edge (loop)->count ();
10986 /* If we have an unreliable loop profile, avoid dropping the entry
10987 count below the header count. This can happen since loops may
10988 have unrealistically low trip counts. */
10989 while (vf > 1
10990 && loop->header->count > entry_count
10991 && loop->header->count < entry_count * vf)
10993 if (dump_file && (dump_flags & TDF_DETAILS))
10994 fprintf (dump_file,
10995 "Vectorization factor %i seems too large for profile "
10996 "previously believed to be consistent; reducing.\n", vf);
10997 vf /= 2;
11000 if (entry_count.nonzero_p ())
11001 set_edge_probability_and_rescale_others
11002 (exit_e,
11003 entry_count.probability_in (loop->header->count / vf));
11004 /* Avoid producing a very large exit probability when we do not have
11005 a sensible profile. */
11006 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11007 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11008 loop->latch->count = single_pred_edge (loop->latch)->count ();
11010 scale_loop_profile (loop, profile_probability::always () / vf,
11011 get_likely_max_loop_iterations_int (loop));
11014 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11015 latch edge values originally defined by it. */
11017 static void
11018 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11019 stmt_vec_info def_stmt_info)
11021 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11022 if (!def || TREE_CODE (def) != SSA_NAME)
11023 return;
11024 stmt_vec_info phi_info;
11025 imm_use_iterator iter;
11026 use_operand_p use_p;
11027 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11029 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11030 if (!phi)
11031 continue;
11032 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11033 && (phi_info = loop_vinfo->lookup_stmt (phi))
11034 && STMT_VINFO_RELEVANT_P (phi_info)))
11035 continue;
11036 loop_p loop = gimple_bb (phi)->loop_father;
11037 edge e = loop_latch_edge (loop);
11038 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11039 continue;
11041 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11042 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11043 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11045 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11046 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11047 gcc_assert (phi_defs.length () == latch_defs.length ());
11048 for (unsigned i = 0; i < phi_defs.length (); ++i)
11049 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11050 gimple_get_lhs (latch_defs[i]), e,
11051 gimple_phi_arg_location (phi, e->dest_idx));
11053 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11055 /* For first order recurrences we have to update both uses of
11056 the latch definition, the one in the PHI node and the one
11057 in the generated VEC_PERM_EXPR. */
11058 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11059 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11060 gcc_assert (phi_defs.length () == latch_defs.length ());
11061 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11062 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11063 for (unsigned i = 0; i < phi_defs.length (); ++i)
11065 gassign *perm = as_a <gassign *> (phi_defs[i]);
11066 if (i > 0)
11067 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11068 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11069 update_stmt (perm);
11071 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11072 gimple_phi_arg_location (phi, e->dest_idx));
11077 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11078 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11079 stmt_vec_info. */
11081 static bool
11082 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11083 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11085 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11086 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11088 if (dump_enabled_p ())
11089 dump_printf_loc (MSG_NOTE, vect_location,
11090 "------>vectorizing statement: %G", stmt_info->stmt);
11092 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11093 vect_loop_kill_debug_uses (loop, stmt_info);
11095 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11096 && !STMT_VINFO_LIVE_P (stmt_info))
11097 return false;
11099 if (STMT_VINFO_VECTYPE (stmt_info))
11101 poly_uint64 nunits
11102 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11103 if (!STMT_SLP_TYPE (stmt_info)
11104 && maybe_ne (nunits, vf)
11105 && dump_enabled_p ())
11106 /* For SLP, VF is set according to the unrolling factor, not the
11107 vector size, hence this message is not valid for SLP. */
11108 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11111 /* Pure SLP statements have already been vectorized. We still need
11112 to apply loop vectorization to hybrid SLP statements. */
11113 if (PURE_SLP_STMT (stmt_info))
11114 return false;
11116 if (dump_enabled_p ())
11117 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11119 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11120 *seen_store = stmt_info;
11122 return true;
11125 /* Helper function to pass to simplify_replace_tree to enable replacing
11126 trees in the hash_map with their corresponding values. */
11128 static tree
11129 find_in_mapping (tree t, void *context)
11131 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11133 tree *value = mapping->get (t);
11134 return value ? *value : t;
11137 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11138 original loop that has now been vectorized.
11140 The inits of the data_references need to be advanced with the number of
11141 iterations of the main loop. This has been computed in vect_do_peeling and
11142 is stored in parameter ADVANCE. We first restore the data_references
11143 initial offset with the values recorded in ORIG_DRS_INIT.
11145 Since the loop_vec_info of this EPILOGUE was constructed for the original
11146 loop, its stmt_vec_infos all point to the original statements. These need
11147 to be updated to point to their corresponding copies as well as the SSA_NAMES
11148 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11150 The data_reference's connections also need to be updated. Their
11151 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11152 stmt_vec_infos, their statements need to point to their corresponding copy,
11153 if they are gather loads or scatter stores then their reference needs to be
11154 updated to point to its corresponding copy and finally we set
11155 'base_misaligned' to false as we have already peeled for alignment in the
11156 prologue of the main loop. */
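/* Illustrative example (assumed numbers): if the prologue peeled P
   scalar iterations and the main vectorized loop consumed another M,
   ADVANCE corresponds to P + M, so a unit-stride access a[i] whose DR
   init was a[0] in the original loop starts at a[P + M] in the
   epilogue.  */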
11158 static void
11159 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11161 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11162 auto_vec<gimple *> stmt_worklist;
11163 hash_map<tree,tree> mapping;
11164 gimple *orig_stmt, *new_stmt;
11165 gimple_stmt_iterator epilogue_gsi;
11166 gphi_iterator epilogue_phi_gsi;
11167 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11168 basic_block *epilogue_bbs = get_loop_body (epilogue);
11169 unsigned i;
11171 free (LOOP_VINFO_BBS (epilogue_vinfo));
11172 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11174 /* Advance the data_references by the number of iterations of the previous
11175 loop and its prologue. */
11176 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11179 /* The EPILOGUE loop is a copy of the original loop so they share the same
11180 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11181 point to the copied statements. We also create a mapping of all LHS' in
11182 the original loop and all the LHS' in the EPILOGUE and create worklists to
11183 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11184 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11186 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11187 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11189 new_stmt = epilogue_phi_gsi.phi ();
11191 gcc_assert (gimple_uid (new_stmt) > 0);
11192 stmt_vinfo
11193 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11195 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11196 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11198 mapping.put (gimple_phi_result (orig_stmt),
11199 gimple_phi_result (new_stmt));
11200 /* PHI nodes cannot have patterns or related statements. */
11201 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11202 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11205 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11206 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11208 new_stmt = gsi_stmt (epilogue_gsi);
11209 if (is_gimple_debug (new_stmt))
11210 continue;
11212 gcc_assert (gimple_uid (new_stmt) > 0);
11213 stmt_vinfo
11214 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11216 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11217 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11219 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11220 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11222 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11224 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11225 for (gimple_stmt_iterator gsi = gsi_start (seq);
11226 !gsi_end_p (gsi); gsi_next (&gsi))
11227 stmt_worklist.safe_push (gsi_stmt (gsi));
11230 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11231 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11233 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11234 stmt_worklist.safe_push (stmt);
11235 /* Set BB such that the assert in
11236 'get_initial_def_for_reduction' is able to determine that
11237 the BB of the related stmt is inside this loop. */
11238 gimple_set_bb (stmt,
11239 gimple_bb (new_stmt));
11240 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11241 gcc_assert (related_vinfo == NULL
11242 || related_vinfo == stmt_vinfo);
11247 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11248 using the original main loop and thus need to be updated to refer to the
11249 cloned variables used in the epilogue. */
11250 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11252 gimple *stmt = stmt_worklist[i];
11253 tree *new_op;
11255 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11257 tree op = gimple_op (stmt, j);
11258 if ((new_op = mapping.get(op)))
11259 gimple_set_op (stmt, j, *new_op);
11260 else
11262 /* PR92429: The last argument of simplify_replace_tree disables
11263 folding when replacing arguments. This is required as
11264 otherwise you might end up with different statements than the
11265 ones analyzed in vect_loop_analyze, leading to different
11266 vectorization. */
11267 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11268 &find_in_mapping, &mapping, false);
11269 gimple_set_op (stmt, j, op);
11274 struct data_reference *dr;
11275 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11276 FOR_EACH_VEC_ELT (datarefs, i, dr)
11278 orig_stmt = DR_STMT (dr);
11279 gcc_assert (gimple_uid (orig_stmt) > 0);
11280 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11281 /* Data references for gather loads and scatter stores do not use the
11282 updated offset we set using ADVANCE. Instead we have to make sure the
11283 reference in each data reference points to the corresponding copy of
11284 the original in the epilogue. */
11285 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11286 == VMAT_GATHER_SCATTER)
11288 DR_REF (dr)
11289 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11290 &find_in_mapping, &mapping);
11291 DR_BASE_ADDRESS (dr)
11292 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11293 &find_in_mapping, &mapping);
11295 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11296 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11297 /* The vector size of the epilogue is smaller than that of the main loop,
11298 so the alignment is either the same or lower. This means the DR will
11299 by definition be aligned. */
11300 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11303 epilogue_vinfo->shared->datarefs_copy.release ();
11304 epilogue_vinfo->shared->save_datarefs ();
11307 /* Function vect_transform_loop.
11309 The analysis phase has determined that the loop is vectorizable.
11310 Vectorize the loop - create vectorized stmts to replace the scalar
11311 stmts in the loop, and update the loop exit condition.
11312 Returns the scalar epilogue loop if any. */
11314 class loop *
11315 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11317 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11318 class loop *epilogue = NULL;
11319 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11320 int nbbs = loop->num_nodes;
11321 int i;
11322 tree niters_vector = NULL_TREE;
11323 tree step_vector = NULL_TREE;
11324 tree niters_vector_mult_vf = NULL_TREE;
11325 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11326 unsigned int lowest_vf = constant_lower_bound (vf);
11327 gimple *stmt;
11328 bool check_profitability = false;
11329 unsigned int th;
11330 bool flat = maybe_flat_loop_profile (loop);
11332 DUMP_VECT_SCOPE ("vec_transform_loop");
11334 loop_vinfo->shared->check_datarefs ();
11336 /* Use the more conservative vectorization threshold. If the number
11337 of iterations is constant assume the cost check has been performed
11338 by our caller. If the threshold makes all loops profitable that
11339 run at least the (estimated) vectorization factor number of times
11340 checking is pointless, too. */
11341 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11342 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11344 if (dump_enabled_p ())
11345 dump_printf_loc (MSG_NOTE, vect_location,
11346 "Profitability threshold is %d loop iterations.\n",
11347 th);
11348 check_profitability = true;
11351 /* Make sure there exists a single-predecessor exit bb. Do this before
11352 versioning. */
11353 edge e = single_exit (loop);
11354 if (! single_pred_p (e->dest))
11356 split_loop_exit_edge (e, true);
11357 if (dump_enabled_p ())
11358 dump_printf (MSG_NOTE, "split exit edge\n");
11361 /* Version the loop first, if required, so the profitability check
11362 comes first. */
11364 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11366 class loop *sloop
11367 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11368 sloop->force_vectorize = false;
11369 check_profitability = false;
11372 /* Make sure there exists a single-predecessor exit bb also on the
11373 scalar loop copy. Do this after versioning but before peeling
11374 so CFG structure is fine for both scalar and if-converted loop
11375 to make slpeel_duplicate_current_defs_from_edges face matched
11376 loop closed PHI nodes on the exit. */
11377 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11379 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11380 if (! single_pred_p (e->dest))
11382 split_loop_exit_edge (e, true);
11383 if (dump_enabled_p ())
11384 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11388 tree niters = vect_build_loop_niters (loop_vinfo);
11389 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11390 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11391 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11392 tree advance;
11393 drs_init_vec orig_drs_init;
11395 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11396 &step_vector, &niters_vector_mult_vf, th,
11397 check_profitability, niters_no_overflow,
11398 &advance);
11399 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11400 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11402 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11403 block after the loop exit. We need to scale all of that. */
11404 basic_block preheader
11405 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11406 preheader->count
11407 = preheader->count.apply_probability
11408 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11409 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11410 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11411 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11412 = preheader->count;
11415 if (niters_vector == NULL_TREE)
11417 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11418 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11419 && known_eq (lowest_vf, vf))
11421 niters_vector
11422 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11423 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11424 step_vector = build_one_cst (TREE_TYPE (niters));
11426 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11427 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11428 &step_vector, niters_no_overflow);
11429 else
11430 /* vect_do_peeling subtracted the number of peeled prologue
11431 iterations from LOOP_VINFO_NITERS. */
11432 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11433 &niters_vector, &step_vector,
11434 niters_no_overflow);
11437 /* 1) Make sure the loop header has exactly two entries
11438 2) Make sure we have a preheader basic block. */
11440 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11442 split_edge (loop_preheader_edge (loop));
11444 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11445 /* This will deal with any possible peeling. */
11446 vect_prepare_for_masked_peels (loop_vinfo);
11448 /* Schedule the SLP instances first, then handle loop vectorization
11449 below. */
11450 if (!loop_vinfo->slp_instances.is_empty ())
11452 DUMP_VECT_SCOPE ("scheduling SLP instances");
11453 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11456 /* FORNOW: the vectorizer supports only loops whose body consists
11457 of one basic block (header + empty latch). When the vectorizer
11458 supports more involved loop forms, the order in which the BBs are
11459 traversed needs to be reconsidered. */
11461 for (i = 0; i < nbbs; i++)
11463 basic_block bb = bbs[i];
11464 stmt_vec_info stmt_info;
11466 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11467 gsi_next (&si))
11469 gphi *phi = si.phi ();
11470 if (dump_enabled_p ())
11471 dump_printf_loc (MSG_NOTE, vect_location,
11472 "------>vectorizing phi: %G", (gimple *) phi);
11473 stmt_info = loop_vinfo->lookup_stmt (phi);
11474 if (!stmt_info)
11475 continue;
11477 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11478 vect_loop_kill_debug_uses (loop, stmt_info);
11480 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11481 && !STMT_VINFO_LIVE_P (stmt_info))
11482 continue;
11484 if (STMT_VINFO_VECTYPE (stmt_info)
11485 && (maybe_ne
11486 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11487 && dump_enabled_p ())
11488 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11490 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11491 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11492 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11493 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11494 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11495 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11496 && ! PURE_SLP_STMT (stmt_info))
11498 if (dump_enabled_p ())
11499 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11500 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11504 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11505 gsi_next (&si))
11507 gphi *phi = si.phi ();
11508 stmt_info = loop_vinfo->lookup_stmt (phi);
11509 if (!stmt_info)
11510 continue;
11512 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11513 && !STMT_VINFO_LIVE_P (stmt_info))
11514 continue;
11516 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11517 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11518 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11519 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11520 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11521 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11522 && ! PURE_SLP_STMT (stmt_info))
11523 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11526 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11527 !gsi_end_p (si);)
11529 stmt = gsi_stmt (si);
11530 /* During vectorization remove existing clobber stmts. */
11531 if (gimple_clobber_p (stmt))
11533 unlink_stmt_vdef (stmt);
11534 gsi_remove (&si, true);
11535 release_defs (stmt);
11537 else
11539 /* Ignore vector stmts created in the outer loop. */
11540 stmt_info = loop_vinfo->lookup_stmt (stmt);
11542 /* vector stmts created in the outer-loop during vectorization of
11543 stmts in an inner-loop may not have a stmt_info, and do not
11544 need to be vectorized. */
11545 stmt_vec_info seen_store = NULL;
11546 if (stmt_info)
11548 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11550 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11551 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11552 !gsi_end_p (subsi); gsi_next (&subsi))
11554 stmt_vec_info pat_stmt_info
11555 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11556 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11557 &si, &seen_store);
11559 stmt_vec_info pat_stmt_info
11560 = STMT_VINFO_RELATED_STMT (stmt_info);
11561 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11562 &si, &seen_store))
11563 maybe_set_vectorized_backedge_value (loop_vinfo,
11564 pat_stmt_info);
11566 else
11568 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11569 &seen_store))
11570 maybe_set_vectorized_backedge_value (loop_vinfo,
11571 stmt_info);
11574 gsi_next (&si);
11575 if (seen_store)
11577 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11578 /* Interleaving. If IS_STORE is TRUE, the
11579 vectorization of the interleaving chain was
11580 completed - free all the stores in the chain. */
11581 vect_remove_stores (loop_vinfo,
11582 DR_GROUP_FIRST_ELEMENT (seen_store));
11583 else
11584 /* Free the attached stmt_vec_info and remove the stmt. */
11585 loop_vinfo->remove_stmt (stmt_info);
11590 /* Stub out scalar statements that must not survive vectorization.
11591 Doing this here helps with grouped statements, or statements that
11592 are involved in patterns. */
11593 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11594 !gsi_end_p (gsi); gsi_next (&gsi))
11596 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11597 if (!call || !gimple_call_internal_p (call))
11598 continue;
11599 internal_fn ifn = gimple_call_internal_fn (call);
11600 if (ifn == IFN_MASK_LOAD)
11602 tree lhs = gimple_get_lhs (call);
11603 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11605 tree zero = build_zero_cst (TREE_TYPE (lhs));
11606 gimple *new_stmt = gimple_build_assign (lhs, zero);
11607 gsi_replace (&gsi, new_stmt, true);
11610 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11612 tree lhs = gimple_get_lhs (call);
11613 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11615 tree else_arg
11616 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11617 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11618 gsi_replace (&gsi, new_stmt, true);
11622 } /* BBs in loop */
11624 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11625 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11626 if (integer_onep (step_vector))
11627 niters_no_overflow = true;
11628 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11629 niters_vector_mult_vf, !niters_no_overflow);
11631 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11633 /* True if the final iteration might not handle a full vector's
11634 worth of scalar iterations. */
11635 bool final_iter_may_be_partial
11636 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11637 /* The minimum number of iterations performed by the epilogue. This
11638 is 1 when peeling for gaps because we always need a final scalar
11639 iteration. */
11640 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11641 /* +1 to convert latch counts to loop iteration counts,
11642 -min_epilogue_iters to remove iterations that cannot be performed
11643 by the vector code. */
11644 int bias_for_lowest = 1 - min_epilogue_iters;
11645 int bias_for_assumed = bias_for_lowest;
11646 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11647 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11649 /* When the amount of peeling is known at compile time, the first
11650 iteration will have exactly alignment_npeels active elements.
11651 In the worst case it will have at least one. */
11652 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11653 bias_for_lowest += lowest_vf - min_first_active;
11654 bias_for_assumed += assumed_vf - min_first_active;
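/* Illustrative example, not in the original source: with partial vectors
   in use, lowest_vf == assumed_vf == 4, no peeling for gaps
   (min_epilogue_iters == 0) and a compile-time alignment peel of 2
   iterations, min_first_active == 2 and both biases become
   1 + (4 - 2) == 3.  */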
11656 /* In these calculations the "- 1" converts loop iteration counts
11657 back to latch counts. */
11658 if (loop->any_upper_bound)
11660 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11661 loop->nb_iterations_upper_bound
11662 = (final_iter_may_be_partial
11663 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11664 lowest_vf) - 1
11665 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11666 lowest_vf) - 1);
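/* Illustrative example, not in the original source: with a latch-count
   upper bound of 103, bias_for_lowest == 1 and lowest_vf == 4, the bound
   becomes floor ((103 + 1) / 4) - 1 == 25, i.e. at most 26 vector
   iterations.  */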
11667 if (main_vinfo
11668 /* Both peeling for alignment and peeling for gaps can end up
11669 with the scalar epilogue running for more than VF-1 iterations. */
11670 && !main_vinfo->peeling_for_alignment
11671 && !main_vinfo->peeling_for_gaps)
11673 unsigned int bound;
11674 poly_uint64 main_iters
11675 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11676 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11677 main_iters
11678 = upper_bound (main_iters,
11679 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11680 if (can_div_away_from_zero_p (main_iters,
11681 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11682 &bound))
11683 loop->nb_iterations_upper_bound
11684 = wi::umin ((widest_int) (bound - 1),
11685 loop->nb_iterations_upper_bound);
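/* Added commentary, not in the original source: MAIN_ITERS above is a
   rough upper bound on the scalar iterations the vectorized epilogue can
   receive; either the main vector loop was skipped because NITERS was
   below the cost or versioning threshold, or it ran and left behind
   fewer than its own VF iterations.  */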
11688 if (loop->any_likely_upper_bound)
11689 loop->nb_iterations_likely_upper_bound
11690 = (final_iter_may_be_partial
11691 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11692 + bias_for_lowest, lowest_vf) - 1
11693 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11694 + bias_for_lowest, lowest_vf) - 1);
11695 if (loop->any_estimate)
11696 loop->nb_iterations_estimate
11697 = (final_iter_may_be_partial
11698 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11699 assumed_vf) - 1
11700 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11701 assumed_vf) - 1);
11702 scale_profile_for_vect_loop (loop, assumed_vf, flat);
11704 if (dump_enabled_p ())
11706 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11708 dump_printf_loc (MSG_NOTE, vect_location,
11709 "LOOP VECTORIZED\n");
11710 if (loop->inner)
11711 dump_printf_loc (MSG_NOTE, vect_location,
11712 "OUTER LOOP VECTORIZED\n");
11713 dump_printf (MSG_NOTE, "\n");
11715 else
11716 dump_printf_loc (MSG_NOTE, vect_location,
11717 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11718 GET_MODE_NAME (loop_vinfo->vector_mode));
11721 /* Loops vectorized with a variable factor won't benefit from
11722 unrolling/peeling. */
11723 if (!vf.is_constant ())
11725 loop->unroll = 1;
11726 if (dump_enabled_p ())
11727 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11728 " variable-length vectorization factor\n");
11730 /* Free SLP instances here because otherwise stmt reference counting
11731 won't work. */
11732 slp_instance instance;
11733 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11734 vect_free_slp_instance (instance);
11735 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11736 /* Clear the safelen field since its value is invalid after vectorization:
11737 the vectorized loop can have loop-carried dependencies. */
11738 loop->safelen = 0;
11740 if (epilogue)
11742 update_epilogue_loop_vinfo (epilogue, advance);
11744 epilogue->simduid = loop->simduid;
11745 epilogue->force_vectorize = loop->force_vectorize;
11746 epilogue->dont_vectorize = false;
11749 return epilogue;
11752 /* The code below performs a simple optimization: it reverts
11753 if-conversion for masked stores, i.e. if the mask of a store is all-zero,
11754 the store is skipped and, where possible, so are the stored-value producers.
11755 For example,
11756 for (i=0; i<n; i++)
11757 if (c[i])
11759 p1[i] += 1;
11760 p2[i] = p3[i] +2;
11762 this transformation will produce the following semi-hammock:
11764 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11766 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11767 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11768 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11769 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11770 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11771 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11775 void
11776 optimize_mask_stores (class loop *loop)
11778 basic_block *bbs = get_loop_body (loop);
11779 unsigned nbbs = loop->num_nodes;
11780 unsigned i;
11781 basic_block bb;
11782 class loop *bb_loop;
11783 gimple_stmt_iterator gsi;
11784 gimple *stmt;
11785 auto_vec<gimple *> worklist;
11786 auto_purge_vect_location sentinel;
11788 vect_location = find_loop_location (loop);
11789 /* Pick up all masked stores in the loop, if any. */
11790 for (i = 0; i < nbbs; i++)
11792 bb = bbs[i];
11793 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11794 gsi_next (&gsi))
11796 stmt = gsi_stmt (gsi);
11797 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11798 worklist.safe_push (stmt);
11802 free (bbs);
11803 if (worklist.is_empty ())
11804 return;
11806 /* Loop has masked stores. */
11807 while (!worklist.is_empty ())
11809 gimple *last, *last_store;
11810 edge e, efalse;
11811 tree mask;
11812 basic_block store_bb, join_bb;
11813 gimple_stmt_iterator gsi_to;
11814 tree vdef, new_vdef;
11815 gphi *phi;
11816 tree vectype;
11817 tree zero;
11819 last = worklist.pop ();
11820 mask = gimple_call_arg (last, 2);
11821 bb = gimple_bb (last);
11822 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
11823 the same loop as if_bb. That loop can differ from LOOP when a
11824 two-level loop nest is vectorized and the mask_store belongs to the inner
11825 one. */
11826 e = split_block (bb, last);
11827 bb_loop = bb->loop_father;
11828 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11829 join_bb = e->dest;
11830 store_bb = create_empty_bb (bb);
11831 add_bb_to_loop (store_bb, bb_loop);
11832 e->flags = EDGE_TRUE_VALUE;
11833 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11834 /* Put STORE_BB on the likely path. */
11835 efalse->probability = profile_probability::likely ();
11836 e->probability = efalse->probability.invert ();
11837 store_bb->count = efalse->count ();
11838 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11839 if (dom_info_available_p (CDI_DOMINATORS))
11840 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11841 if (dump_enabled_p ())
11842 dump_printf_loc (MSG_NOTE, vect_location,
11843 "Create new block %d to sink mask stores.",
11844 store_bb->index);
11845 /* Create vector comparison with boolean result. */
11846 vectype = TREE_TYPE (mask);
11847 zero = build_zero_cst (vectype);
11848 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11849 gsi = gsi_last_bb (bb);
11850 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
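/* Added commentary, not in the original source: the TRUE edge
   (mask == {0, ...}) falls through to JOIN_BB and skips the stores; the
   FALSE edge executes the masked stores sunk into STORE_BB.  */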
11851 /* Create a new PHI node for the vdef of the last masked store:
11852 .MEM_2 = VDEF <.MEM_1>
11853 will be converted to
11854 .MEM_3 = VDEF <.MEM_1>
11855 and a new PHI node will be created in the join bb
11856 .MEM_2 = PHI <.MEM_1, .MEM_3>
11858 vdef = gimple_vdef (last);
11859 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11860 gimple_set_vdef (last, new_vdef);
11861 phi = create_phi_node (vdef, join_bb);
11862 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11864 /* Put all masked stores with the same mask into STORE_BB if possible. */
11865 while (true)
11867 gimple_stmt_iterator gsi_from;
11868 gimple *stmt1 = NULL;
11870 /* Move masked store to STORE_BB. */
11871 last_store = last;
11872 gsi = gsi_for_stmt (last);
11873 gsi_from = gsi;
11874 /* Shift GSI to the previous stmt for further traversal. */
11875 gsi_prev (&gsi);
11876 gsi_to = gsi_start_bb (store_bb);
11877 gsi_move_before (&gsi_from, &gsi_to);
11878 /* Set GSI_TO to the start of the now non-empty block. */
11879 gsi_to = gsi_start_bb (store_bb);
11880 if (dump_enabled_p ())
11881 dump_printf_loc (MSG_NOTE, vect_location,
11882 "Move stmt to created bb\n%G", last);
11883 /* Move all stored value producers if possible. */
11884 while (!gsi_end_p (gsi))
11886 tree lhs;
11887 imm_use_iterator imm_iter;
11888 use_operand_p use_p;
11889 bool res;
11891 /* Skip debug statements. */
11892 if (is_gimple_debug (gsi_stmt (gsi)))
11894 gsi_prev (&gsi);
11895 continue;
11897 stmt1 = gsi_stmt (gsi);
11898 /* Do not consider statements writing to memory or having
11899 a volatile operand. */
11900 if (gimple_vdef (stmt1)
11901 || gimple_has_volatile_ops (stmt1))
11902 break;
11903 gsi_from = gsi;
11904 gsi_prev (&gsi);
11905 lhs = gimple_get_lhs (stmt1);
11906 if (!lhs)
11907 break;
11909 /* The LHS of a vectorized stmt must be an SSA_NAME. */
11910 if (TREE_CODE (lhs) != SSA_NAME)
11911 break;
11913 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11915 /* Remove dead scalar statement. */
11916 if (has_zero_uses (lhs))
11918 gsi_remove (&gsi_from, true);
11919 continue;
11923 /* Check that LHS does not have uses outside of STORE_BB. */
11924 res = true;
11925 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11927 gimple *use_stmt;
11928 use_stmt = USE_STMT (use_p);
11929 if (is_gimple_debug (use_stmt))
11930 continue;
11931 if (gimple_bb (use_stmt) != store_bb)
11933 res = false;
11934 break;
11937 if (!res)
11938 break;
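/* Added commentary, not in the original source: a statement that reads
   memory may only be sunk if it shares the masked store's virtual use,
   i.e. no other memory write intervenes between it and the store.  */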
11940 if (gimple_vuse (stmt1)
11941 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11942 break;
11944 /* Can move STMT1 to STORE_BB. */
11945 if (dump_enabled_p ())
11946 dump_printf_loc (MSG_NOTE, vect_location,
11947 "Move stmt to created bb\n%G", stmt1);
11948 gsi_move_before (&gsi_from, &gsi_to);
11949 /* Shift GSI_TO for further insertion. */
11950 gsi_prev (&gsi_to);
11952 /* Put other masked stores with the same mask into STORE_BB. */
11953 if (worklist.is_empty ()
11954 || gimple_call_arg (worklist.last (), 2) != mask
11955 || worklist.last () != stmt1)
11956 break;
11957 last = worklist.pop ();
11959 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11963 /* Decide whether it is possible to use a zero-based induction variable
11964 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11965 the value that the induction variable must be able to hold in order
11966 to ensure that the rgroups eventually have no active vector elements.
11967 Return -1 otherwise. */
11969 widest_int
11970 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11972 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11973 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11974 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11976 /* Calculate the value that the induction variable must be able
11977 to hit in order to ensure that we end the loop with an all-false mask.
11978 This involves adding the maximum number of inactive trailing scalar
11979 iterations. */
11980 widest_int iv_limit = -1;
11981 if (max_loop_iterations (loop, &iv_limit))
11983 if (niters_skip)
11985 /* Add the maximum number of skipped iterations to the
11986 maximum iteration count. */
11987 if (TREE_CODE (niters_skip) == INTEGER_CST)
11988 iv_limit += wi::to_widest (niters_skip);
11989 else
11990 iv_limit += max_vf - 1;
11992 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11993 /* Make a conservatively-correct assumption. */
11994 iv_limit += max_vf - 1;
11996 /* IV_LIMIT is the maximum number of latch iterations, which is also
11997 the maximum in-range IV value. Round this value down to the previous
11998 vector alignment boundary and then add an extra full iteration. */
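/* Illustrative example, not in the original source: with a constant VF of
   4 (so known_alignment (vf) == 4), max_vf == 4 and IV_LIMIT == 17 at this
   point, the result is (17 & -4) + 4 == 20.  */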
11999 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12000 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12002 return iv_limit;
12005 /* For the given rgroup_controls RGC, check whether an induction variable
12006 would ever hit a value that produces a set of all-false masks or zero
12007 lengths before wrapping around. Return true if it's possible to wrap
12008 around before hitting the desired value, otherwise return false. */
12010 bool
12011 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12013 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12015 if (iv_limit == -1)
12016 return true;
12018 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12019 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12020 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
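/* Illustrative example, not in the original source: with iv_limit == 1000
   and nitems == 4 the IV must reach 4000, which needs 12 bits; a 16-bit
   compare type therefore cannot wrap and we return false.  */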
12022 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12023 return true;
12025 return false;