gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "memmodel.h"
  36 #include "optabs.h"
  37 #include "diagnostic-core.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "cfganal.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-ssa-loop.h"
  48 #include "cfgloop.h"
  49 #include "tree-scalar-evolution.h"
  50 #include "tree-vectorizer.h"
  51 #include "gimple-fold.h"
  52 #include "cgraph.h"
  53 #include "tree-cfg.h"
  54 #include "tree-if-conv.h"
  55 #include "internal-fn.h"
  56 #include "tree-vector-builder.h"
  57 #include "vec-perm-indices.h"
  58 #include "tree-eh.h"
  59 #include "case-cfn-macros.h"
  60 #include "langhooks.h"
  61
  62 /* Loop Vectorization Pass.
  63
  64    This pass tries to vectorize loops.
  65
  66    For example, the vectorizer transforms the following simple loop:
  67
  68         short a[N]; short b[N]; short c[N]; int i;
  69
  70         for (i=0; i<N; i++){
  71           a[i] = b[i] + c[i];
  72         }
  73
  74    as if it was manually vectorized by rewriting the source code into:
  75
  76         typedef int __attribute__((mode(V8HI))) v8hi;
  77         short a[N];  short b[N]; short c[N];   int i;
  78         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  79         v8hi va, vb, vc;
  80
  81         for (i=0; i<N/8; i++){
  82           vb = pb[i];
  83           vc = pc[i];
  84           va = vb + vc;
  85           pa[i] = va;
  86         }
  87
  88         The main entry to this pass is vectorize_loops(), in which
  89    the vectorizer applies a set of analyses on a given set of loops,
  90    followed by the actual vectorization transformation for the loops that
  91    had successfully passed the analysis phase.
  92         Throughout this pass we make a distinction between two types of
  93    data: scalars (which are represented by SSA_NAMES), and memory references
  94    ("data-refs").  These two types of data require different handling both
  95    during analysis and transformation.  The types of data-refs that the
  96    vectorizer currently supports are ARRAY_REFs whose base is an array DECL
  97    (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
  98    accesses are required to have a simple (consecutive) access pattern.
  99
 100    Analysis phase:
 101    ===============
 102         The driver for the analysis phase is vect_analyze_loop().
 103    It applies a set of analyses, some of which rely on the scalar evolution
 104    analyzer (scev) developed by Sebastian Pop.
 105
 106         During the analysis phase the vectorizer records some information
 107    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 108    loop, as well as general information about the loop as a whole, which is
 109    recorded in a "loop_vec_info" struct attached to each loop.
 110
 111    Transformation phase:
 112    =====================
 113         The loop transformation phase scans all the stmts in the loop, and
 114    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 115    the loop that needs to be vectorized.  It inserts the vector code sequence
 116    just before the scalar stmt S, and records a pointer to the vector code
 117    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 118    attached to S).  This pointer will be used for the vectorization of following
 119    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 120    otherwise, we rely on dead code elimination for removing it.
 121
 122         For example, say stmt S1 was vectorized into stmt VS1:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    S2:  a = b;
 127
 128    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 129    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 130    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 131    resulting sequence would be:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    VS2: va = vb;
 136    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 137
 138         Operands that are not SSA_NAMEs, are data-refs that appear in
 139    load/store operations (like 'x[i]' in S1), and are handled differently.
 140
 141    Target modeling:
 142    =================
 143         Currently the only target specific information that is used is the
 144    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 145    Targets that can support different sizes of vectors, for now will need
 146    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 147    flexibility will be added in the future.
 148
 149         Since we only vectorize operations whose vector form can be
 150    expressed using existing tree codes, to verify that an operation is
 151    supported, the vectorizer checks the relevant optab at the relevant
 152    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 153    the value found is CODE_FOR_nothing, then there's no target support, and
 154    we can't vectorize the stmt.
 155
 156    For additional information on this project see:
 157    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 158 */
 159
 160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 161                                                 unsigned *);
     /* vect_is_simple_reduction is called from vect_analyze_scalar_cycles_1
        below, ahead of its definition, hence the forward declaration.
        NOTE(review): vect_estimate_min_profitable_iters is presumably
        declared here for the same reason; its users are not in the part
        of the file examined -- confirm.  */
 162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 163                                                bool *, bool *, bool);
 164
 165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 166    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 167    may already be set for general statements (not just data refs).

        Statements that are neither relevant nor live, as well as clobbers,
        are skipped and reported as success.  Otherwise the vector types are
        obtained from vect_get_vector_types_for_stmt and a failure from that
        call is returned unchanged.  When a statement vectype is returned it
        is stored in STMT_VINFO_VECTYPE (or asserted equal to the value
        already there), and when a nunits vectype is returned it is passed
        to vect_update_max_nunits together with VF.  */
 168
 169 static opt_result
 170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 171                               bool vectype_maybe_set_p,
 172                               poly_uint64 *vf)
 173 {
 174   gimple *stmt = stmt_info->stmt;
 175
       /* Skipping is not an error: the statement simply does not take part
          in determining the vectorization factor.  */
 176   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 177        && !STMT_VINFO_LIVE_P (stmt_info))
 178       || gimple_clobber_p (stmt))
 179     {
 180       if (dump_enabled_p ())
 181         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 182       return opt_result::success ();
 183     }
 184
 185   tree stmt_vectype, nunits_vectype;
 186   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 187                                                    &stmt_vectype,
 188                                                    &nunits_vectype);
 189   if (!res)
 190     return res;
 191
 192   if (stmt_vectype)
 193     {
 194       if (STMT_VINFO_VECTYPE (stmt_info))
 195         /* The only case when a vectype had been already set is for stmts
 196            that contain a data ref, or for "pattern-stmts" (stmts generated
 197            by the vectorizer to represent/replace a certain idiom).  */
 198         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 199                      || vectype_maybe_set_p)
 200                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 201       else
 202         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 203     }
 204
       /* Either vectype may be absent; VF is only touched when a nunits
          vectype was provided.  */
 205   if (nunits_vectype)
 206     vect_update_max_nunits (vf, nunits_vectype);
 207
 208   return opt_result::success ();
 209 }
 210
 211 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 212    types of STMT_INFO and all attached pattern statements and update
 213    the vectorization factor VF accordingly.  Return a successful
 214    opt_result, or the failing opt_result from
        vect_determine_vf_for_stmt_1 if something prevented vectorization.

        STMT_INFO itself is processed first.  If it is part of a pattern and
        has a related statement, every statement of its pattern definition
        sequence is processed next, followed by the related (pattern)
        statement.  Processing stops at the first failure.  */
 215
 216 static opt_result
 217 vect_determine_vf_for_stmt (vec_info *vinfo,
 218                             stmt_vec_info stmt_info, poly_uint64 *vf)
 219 {
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 222                      stmt_info->stmt);
       /* The original statement is analyzed with VECTYPE_MAYBE_SET_P false;
          the pattern statements below are analyzed with it true.  */
 223   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 224   if (!res)
 225     return res;
 226
 227   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 228       && STMT_VINFO_RELATED_STMT (stmt_info))
 229     {
           /* The def sequence must be read from the original statement
              before STMT_INFO is redirected to its related statement.  */
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 232
 233       /* If a pattern statement has def stmts, analyze them too.  */
 234       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 235            !gsi_end_p (si); gsi_next (&si))
 236         {
 237           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 238           if (dump_enabled_p ())
 239             dump_printf_loc (MSG_NOTE, vect_location,
 240                              "==> examining pattern def stmt: %G",
 241                              def_stmt_info->stmt);
 242           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 243           if (!res)
 244             return res;
 245         }
 246
 247       if (dump_enabled_p ())
 248         dump_printf_loc (MSG_NOTE, vect_location,
 249                          "==> examining pattern statement: %G",
 250                          stmt_info->stmt);
 251       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 252       if (!res)
 253         return res;
 254     }
 255
 256   return opt_result::success ();
 257 }
 258
 259 /* Function vect_determine_vectorization_factor
 260
 261    Determine the vectorization factor (VF).  VF is the number of data elements
 262    that are operated upon in parallel in a single iteration of the vectorized
 263    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 264    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 265    elements can fit in a single vector register.
 266
 267    We currently support vectorization of loops in which all types operated upon
 268    are of the same size.  Therefore this function currently sets VF according to
 269    the size of the types operated upon, and fails if there are multiple sizes
 270    in the loop.
        NOTE(review): the code below does not itself reject mixed sizes; each
        vectype is folded into VF through vect_update_max_nunits.  Confirm
        whether the paragraph above is still accurate.
 271
 272    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 273    original loop:
 274         for (i=0; i<N; i++){
 275           a[i] = b[i] + c[i];
 276         }
 277
 278    vectorized loop:
 279         for (i=0; i<N; i+=VF){
 280           a[i:VF] = b[i:VF] + c[i:VF];
 281         }

        Returns a failing opt_result when no vector type exists for the
        type of a relevant or live PHI result, when analysis of any
        statement fails, or when the resulting VF is known to be <= 1.
        On success the VF is stored in LOOP_VINFO_VECT_FACTOR.
 282 */
 283
 284 static opt_result
 285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 286 {
 287   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 288   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 289   unsigned nbbs = loop->num_nodes;
 290   poly_uint64 vectorization_factor = 1;
 291   tree scalar_type = NULL_TREE;
 292   gphi *phi;
 293   tree vectype;
 294   stmt_vec_info stmt_info;
 295   unsigned i;
 296
 297   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 298
 299   for (i = 0; i < nbbs; i++)
 300     {
 301       basic_block bb = bbs[i];
 302
           /* PHIs first: a relevant or live PHI gets the vector type that
              corresponds to the scalar type of its result.  */
 303       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 304            gsi_next (&si))
 305         {
 306           phi = si.phi ();
 307           stmt_info = loop_vinfo->lookup_stmt (phi);
 308           if (dump_enabled_p ())
 309             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 310                              (gimple *) phi);
 311
 312           gcc_assert (stmt_info);
 313
 314           if (STMT_VINFO_RELEVANT_P (stmt_info)
 315               || STMT_VINFO_LIVE_P (stmt_info))
 316             {
 317               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 318               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 319
 320               if (dump_enabled_p ())
 321                 dump_printf_loc (MSG_NOTE, vect_location,
 322                                  "get vectype for scalar type:  %T\n",
 323                                  scalar_type);
 324
 325               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 326               if (!vectype)
 327                 return opt_result::failure_at (phi,
 328                                                "not vectorized: unsupported "
 329                                                "data-type %T\n",
 330                                                scalar_type);
 331               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 332
 333               if (dump_enabled_p ())
 334                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 335                                  vectype);
 336
 337               if (dump_enabled_p ())
 338                 {
 339                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 340                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 341                   dump_printf (MSG_NOTE, "\n");
 342                 }
 343
 344               vect_update_max_nunits (&vectorization_factor, vectype);
 345             }
 346         }
 347
           /* Then the ordinary statements of the block; debug statements
              do not take part.  */
 348       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 349            gsi_next (&si))
 350         {
 351           if (is_gimple_debug (gsi_stmt (si)))
 352             continue;
 353           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 354           opt_result res
 355             = vect_determine_vf_for_stmt (loop_vinfo,
 356                                           stmt_info, &vectorization_factor);
 357           if (!res)
 358             return res;
 359         }
 360     }
 361
 362   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 363   if (dump_enabled_p ())
 364     {
 365       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 366       dump_dec (MSG_NOTE, vectorization_factor);
 367       dump_printf (MSG_NOTE, "\n");
 368     }
 369
       /* VF starts at 1, so a value known to be <= 1 here means no statement
          raised it.  */
 370   if (known_le (vectorization_factor, 1U))
 371     return opt_result::failure_at (vect_location,
 372                                    "not vectorized: unsupported data-type\n");
 373   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 374   return opt_result::success ();
 375 }
 376
 377
 378 /* Function vect_is_simple_iv_evolution.
 379
 380    FORNOW: A simple evolution of an induction variable in the loop is
 381    considered a polynomial evolution.

        ACCESS_FN is examined with respect to loop number LOOP_NB.  Return
        false if it has no evolution part in that loop, or if the evolution
        part is itself a chrec.  Otherwise store the initial condition in
        *INIT and the evolution part in *STEP and return true if the step is
        one of:
          - an INTEGER_CST;
          - an SSA_NAME that is not defined in a block inside the loop and
            whose type is integral, or scalar floating-point when
            flag_associative_math is set;
          - a REAL_CST when flag_associative_math is set.
        Note that *INIT and *STEP are written before the step is checked, so
        they are also set when false is returned for an unknown step.  */
 382
 383 static bool
 384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 385                              tree * step)
 386 {
 387   tree init_expr;
 388   tree step_expr;
 389   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 390   basic_block bb;
 391
 392   /* When there is no evolution in this loop, the evolution function
 393      is not "simple".  */
 394   if (evolution_part == NULL_TREE)
 395     return false;
 396
 397   /* When the evolution is a polynomial of degree >= 2
 398      the evolution function is not "simple".  */
 399   if (tree_is_chrec (evolution_part))
 400     return false;
 401
 402   step_expr = evolution_part;
 403   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 404
 405   if (dump_enabled_p ())
 406     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 407                      step_expr, init_expr);
 408
 409   *init = init_expr;
 410   *step = step_expr;
 411
       /* Reject every step form that is not listed in the function
          comment.  BB is assigned inside the condition; an SSA_NAME with no
          defining block is treated as defined outside the loop.  */
 412   if (TREE_CODE (step_expr) != INTEGER_CST
 413       && (TREE_CODE (step_expr) != SSA_NAME
 414           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 415               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 416           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 417               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 418                   || !flag_associative_math)))
 419       && (TREE_CODE (step_expr) != REAL_CST
 420           || !flag_associative_math))
 421     {
 422       if (dump_enabled_p ())
 423         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 424                          "step unknown.\n");
 425       return false;
 426     }
 427
 428   return true;
 429 }
 430
 431 /* Function vect_is_nonlinear_iv_evolution
 432
 433    Only support nonlinear induction for integer type
 434    1. neg
 435    2. mul by constant
 436    3. lshift/rshift by constant.
 437
 438    For neg induction, return a fake step as integer -1.

        LOOP_PHI_NODE must have exactly two arguments.  Its preheader
        argument is stored in *INIT; its latch argument must be an SSA_NAME
        defined by an assignment that applies one of the operations above
        directly to the PHI result.  On success *STEP is set, and the
        evolution type, base and part are recorded in STMT_INFO.  Note that
        *INIT is written as soon as the type check passes, so it may be set
        even when false is returned.  */
 439 static bool
 440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 441                                 gphi* loop_phi_node, tree *init, tree *step)
 442 {
 443   tree init_expr, ev_expr, result, op1, op2;
 444   gimple* def;
 445
 446   if (gimple_phi_num_args (loop_phi_node) != 2)
 447     return false;
 448
 449   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 450   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 451
 452   /* Support nonlinear induction only for integer type.  */
 453   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 454     return false;
 455
 456   *init = init_expr;
 457   result = PHI_RESULT (loop_phi_node);
 458
       /* The middle operand only assigns DEF: the comma expression always
          evaluates to false, so it never decides the condition, and it is
          only reached once EV_EXPR is known to be an SSA_NAME.  */
 459   if (TREE_CODE (ev_expr) != SSA_NAME
 460       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 461       || !is_gimple_assign (def))
 462     return false;
 463
 464   enum tree_code t_code = gimple_assign_rhs_code (def);
 465   switch (t_code)
 466     {
 467     case NEGATE_EXPR:
 468       if (gimple_assign_rhs1 (def) != result)
 469         return false;
 470       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 471       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 472       break;
 473
 474     case RSHIFT_EXPR:
 475     case LSHIFT_EXPR:
 476     case MULT_EXPR:
 477       op1 = gimple_assign_rhs1 (def);
 478       op2 = gimple_assign_rhs2 (def);
           /* The PHI result must be the first operand and the second
              operand must be an integer constant.  */
 479       if (TREE_CODE (op2) != INTEGER_CST
 480           || op1 != result)
 481         return false;
 482       *step = op2;
 483       if (t_code == LSHIFT_EXPR)
 484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 485       else if (t_code == RSHIFT_EXPR)
 486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 487       /* Only MULT_EXPR reaches this branch; NEGATE_EXPR has its own case
              above and is recorded as vect_step_op_neg.  */
 488       else
 489         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 490       break;
 491
 492     default:
 493       return false;
 494     }
 495
 496   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 497   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 498
 499   return true;
 500 }
 501
 502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 503    what we are assuming is a double reduction.  For example, given
 504    a structure like this:
 505
 506       outer1:
 507         x_1 = PHI <x_4(outer2), ...>;
 508         ...
 509
 510       inner:
 511         x_2 = PHI <x_1(outer1), ...>;
 512         ...
 513         x_3 = ...;
 514         ...
 515
 516       outer2:
 517         x_4 = PHI <x_3(inner)>;
 518         ...
 519
 520    outer loop analysis would treat x_1 as a double reduction phi and
 521    this function would then return true for x_2.

        The test is whether any argument of PHI has a definition known to
        LOOP_VINFO whose def type is vect_double_reduction_def.  */
 522
 523 static bool
 524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 525 {
 526   use_operand_p use_p;
 527   ssa_op_iter op_iter;
       /* Arguments for which lookup_def returns nothing are ignored.  */
 528   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 529     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 530       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 531         return true;
 532   return false;
 533 }
 534
 535 /* Returns true if Phi is a first-order recurrence. A first-order
 536    recurrence is a non-reduction recurrence relation in which the value of
 537    the recurrence in the current loop iteration equals a value defined in
 538    the previous iteration.

        The checks are: LOOP is the loop being vectorized; the latch
        argument of PHI is an SSA_NAME that is not a default definition, is
        not defined by a PHI and is defined inside LOOP; every non-debug
        use of the PHI result is dominated by, and different from, the
        statement defining the latch value; and a vector type exists for
        the type of the PHI result.  */
 539
 540 static bool
 541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 542                                    gphi *phi)
 543 {
 544   /* A nested cycle isn't vectorizable as first order recurrence.  */
 545   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 546     return false;
 547
 548   /* Ensure the loop latch definition is from within the loop.  */
 549   edge latch = loop_latch_edge (loop);
 550   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 551   if (TREE_CODE (ldef) != SSA_NAME
 552       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 553       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 554       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 555     return false;
 556
 557   tree def = gimple_phi_result (phi);
 558
 559   /* Ensure every use_stmt of the phi node is dominated by the latch
 560      definition.  A use in the latch definition statement itself is
          rejected as well; debug statements are ignored.  */
 561   imm_use_iterator imm_iter;
 562   use_operand_p use_p;
 563   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 564     if (!is_gimple_debug (USE_STMT (use_p))
 565         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 566             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 567                                             USE_STMT (use_p))))
 568       return false;
 569
 570   /* First-order recurrence autovectorization needs shuffle vector.
          Only the existence of a vector type is checked here.  */
 571   tree scalar_type = TREE_TYPE (def);
 572   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 573   if (!vectype)
 574     return false;
 575
 576   return true;
 577 }
 578
 579 /* Function vect_analyze_scalar_cycles_1.
 580
 581    Examine the cross iteration def-use cycles of scalar variables
 582    in LOOP.  LOOP_VINFO represents the loop that is now being
 583    considered for vectorization (can be LOOP, or an outer-loop
 584    enclosing LOOP).  SLP indicates there will be some subsequent
 585    slp analyses or not.

        The non-virtual PHIs in the header of LOOP are classified in two
        passes.  The first pass marks inductions and queues every other PHI
        on a worklist.  The second pass classifies each queued PHI as a
        double reduction, nested cycle, reduction or first-order
        recurrence; anything else keeps vect_unknown_def_type.  */
 586
 587 static void
 588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 589                               bool slp)
 590 {
 591   basic_block bb = loop->header;
 592   tree init, step;
 593   auto_vec<stmt_vec_info, 64> worklist;
 594   gphi_iterator gsi;
 595   bool double_reduc, reduc_chain;
 596
 597   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 598
 599   /* First - identify all inductions.  Reduction detection assumes that all the
 600      inductions have been identified, therefore, this order must not be
 601      changed.  */
 602   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 603     {
 604       gphi *phi = gsi.phi ();
 605       tree access_fn = NULL;
 606       tree def = PHI_RESULT (phi);
 607       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 611                          (gimple *) phi);
 612
 613       /* Skip virtual phi's.  The data dependences that are associated with
 614          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 615       if (virtual_operand_p (def))
 616         continue;
 617
 618       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 619
 620       /* Analyze the evolution function.  */
 621       access_fn = analyze_scalar_evolution (loop, def);
 622       if (access_fn)
 623         {
 624           STRIP_NOPS (access_fn);
 625           if (dump_enabled_p ())
 626             dump_printf_loc (MSG_NOTE, vect_location,
 627                              "Access function of PHI: %T\n", access_fn);
 628           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 629             = initial_condition_in_loop_num (access_fn, loop->num);
 630           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 631             = evolution_part_in_loop_num (access_fn, loop->num);
 632         }
 633
           /* The PHI is queued for the second pass unless it is an
              induction.  It counts as an induction either when it has a
              simple evolution (it must not be the inner PHI of a double
              reduction, and for a LOOP other than the one being vectorized
              the step must be an INTEGER_CST), or, failing that, when LOOP
              is the loop being vectorized and the PHI is a supported
              nonlinear induction.  */
 634       if ((!access_fn
 635            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 636            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 637                                             &init, &step)
 638            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 639                && TREE_CODE (step) != INTEGER_CST))
 640           /* Only handle nonlinear iv for same loop.  */
 641           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 642               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 643                                                   phi, &init, &step)))
 644         {
 645           worklist.safe_push (stmt_vinfo);
 646           continue;
 647         }
 648
 649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 650                   != NULL_TREE);
 651       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 652
 653       if (dump_enabled_p ())
 654         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 655       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 656     }
 657
 658
 659   /* Second - identify all reductions and nested cycles.  */
 660   while (worklist.length () > 0)
 661     {
 662       stmt_vec_info stmt_vinfo = worklist.pop ();
 663       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 664       tree def = PHI_RESULT (phi);
 665
 666       if (dump_enabled_p ())
 667         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 668                          (gimple *) phi);
 669
 670       gcc_assert (!virtual_operand_p (def)
 671                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 672
           /* DOUBLE_REDUC and REDUC_CHAIN are only read below when a
              reduction statement was found.  */
 673       stmt_vec_info reduc_stmt_info
 674         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 675                                     &reduc_chain, slp);
 676       if (reduc_stmt_info)
 677         {
               /* Link the PHI and the reduction statement to each other.  */
 678           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 679           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 680           if (double_reduc)
 681             {
 682               if (dump_enabled_p ())
 683                 dump_printf_loc (MSG_NOTE, vect_location,
 684                                  "Detected double reduction.\n");
 685
 686               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 687               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 688             }
 689           else
 690             {
 691               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 692                 {
 693                   if (dump_enabled_p ())
 694                     dump_printf_loc (MSG_NOTE, vect_location,
 695                                      "Detected vectorizable nested cycle.\n");
 696
 697                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 698                 }
 699               else
 700                 {
 701                   if (dump_enabled_p ())
 702                     dump_printf_loc (MSG_NOTE, vect_location,
 703                                      "Detected reduction.\n");
 704
 705                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 706                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 707                   /* Store the reduction cycles for possible vectorization in
 708                      loop-aware SLP if it was not detected as reduction
 709                      chain.  */
 710                   if (! reduc_chain)
 711                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 712                       (reduc_stmt_info);
 713                 }
 714             }
 715         }
 716       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 717         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 718       else
 719         if (dump_enabled_p ())
 720           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                            "Unknown def-use cycle pattern.\n");
 722     }
 723 }
 724
 725
 726 /* Function vect_analyze_scalar_cycles.
 727
 728    Examine the cross iteration def-use cycles of scalar variables, by
 729    analyzing the loop-header PHIs of scalar variables.  Classify each
 730    cycle as one of the following: invariant, induction, reduction, unknown.
 731    We do that for the loop represented by LOOP_VINFO, and also to its
 732    inner-loop, if exists.  SLP is passed through unchanged to
        vect_analyze_scalar_cycles_1 for both loops.
 733    Examples for scalar cycles:
 734
 735    Example1: reduction:
 736
 737               loop1:
 738               for (i=0; i<N; i++)
 739                  sum += a[i];
 740
 741    Example2: induction:
 742
 743               loop2:
 744               for (i=0; i<N; i++)
 745                  a[i] = i;  */
 746
 747 static void
 748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 749 {
 750   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 751
       /* The loop being vectorized is analyzed first, then its inner loop.  */
 752   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 753
 754   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 755      Reductions in such inner-loop therefore have different properties than
 756      the reductions in the nest that gets vectorized:
 757      1. When vectorized, they are executed in the same order as in the original
 758         scalar loop, so we can't change the order of computation when
 759         vectorizing them.
 760      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 761         current checks are too strict.  */
 762
 763   if (loop->inner)
 764     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 765 }
 766
 767 /* Transfer group and reduction information from STMT_INFO to its
 768    pattern stmt.  STMT_INFO must have a reduction group first element
        set, and its related statement must not.  The group starting at
        STMT_INFO is walked through REDUC_GROUP_NEXT_ELEMENT, and the
        related statements are linked into a parallel group that shares the
        size of the original one.  */
 769
 770 static void
 771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 772 {
 773   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 774   stmt_vec_info stmtp;
 775   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 776               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 777   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 778   do
 779     {
 780       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 781       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 782                            == STMT_VINFO_DEF_TYPE (stmt_info));
           /* Every related statement points back to the first one.  */
 783       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 784       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
           /* Mirror the original next-link; the last related statement's
              next element is not written here.  */
 785       if (stmt_info)
 786         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 787           = STMT_VINFO_RELATED_STMT (stmt_info);
 788     }
 789   while (stmt_info);
 790 }
 791
 792 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 793
 794 static void
 795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 796 {
 797   stmt_vec_info first;
 798   unsigned i;
 799
 800   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 801     {
 802       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 803       while (next)
 804         {
 805           if ((STMT_VINFO_IN_PATTERN_P (next)
 806                != STMT_VINFO_IN_PATTERN_P (first))
 807               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 808             break;
 809           next = REDUC_GROUP_NEXT_ELEMENT (next);
 810         }
 811       /* If all reduction chain members are well-formed patterns adjust
 812          the group to group the pattern stmts instead.  */
 813       if (! next
 814           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 815         {
 816           if (STMT_VINFO_IN_PATTERN_P (first))
 817             {
 818               vect_fixup_reduc_chain (first);
 819               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 820                 = STMT_VINFO_RELATED_STMT (first);
 821             }
 822         }
 823       /* If not all stmt in the chain are patterns or if we failed
 824          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 825          it as regular reduction instead.  */
 826       else
 827         {
 828           stmt_vec_info vinfo = first;
 829           stmt_vec_info last = NULL;
 830           while (vinfo)
 831             {
 832               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 833               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 834               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 835               last = vinfo;
 836               vinfo = next;
 837             }
 838           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 839             = vect_internal_def;
 840           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 841           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 842           --i;
 843         }
 844     }
 845 }
 846
 847 /* Function vect_get_loop_niters.
 848
 849    Determine how many iterations the loop is executed and place it
 850    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 851    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 852    niter information holds in ASSUMPTIONS.
 853
 854    Return the loop exit condition.  */
 855
 856
 857 static gcond *
 858 vect_get_loop_niters (class loop *loop, tree *assumptions,
 859                       tree *number_of_iterations, tree *number_of_iterationsm1)
 860 {
 861   edge exit = single_exit (loop);
 862   class tree_niter_desc niter_desc;
 863   tree niter_assumptions, niter, may_be_zero;
 864   gcond *cond = get_loop_exit_condition (loop);
 865
 866   *assumptions = boolean_true_node;
 867   *number_of_iterationsm1 = chrec_dont_know;
 868   *number_of_iterations = chrec_dont_know;
 869   DUMP_VECT_SCOPE ("get_loop_niters");
 870
 871   if (!exit)
 872     return cond;
 873
 874   may_be_zero = NULL_TREE;
 875   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 876       || chrec_contains_undetermined (niter_desc.niter))
 877     return cond;
 878
 879   niter_assumptions = niter_desc.assumptions;
 880   may_be_zero = niter_desc.may_be_zero;
 881   niter = niter_desc.niter;
 882
 883   if (may_be_zero && integer_zerop (may_be_zero))
 884     may_be_zero = NULL_TREE;
 885
 886   if (may_be_zero)
 887     {
 888       if (COMPARISON_CLASS_P (may_be_zero))
 889         {
 890           /* Try to combine may_be_zero with assumptions, this can simplify
 891              computation of niter expression.  */
 892           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 893             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 894                                              niter_assumptions,
 895                                              fold_build1 (TRUTH_NOT_EXPR,
 896                                                           boolean_type_node,
 897                                                           may_be_zero));
 898           else
 899             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 900                                  build_int_cst (TREE_TYPE (niter), 0),
 901                                  rewrite_to_non_trapping_overflow (niter));
 902
 903           may_be_zero = NULL_TREE;
 904         }
 905       else if (integer_nonzerop (may_be_zero))
 906         {
 907           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 908           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 909           return cond;
 910         }
 911       else
 912         return cond;
 913     }
 914
 915   *assumptions = niter_assumptions;
 916   *number_of_iterationsm1 = niter;
 917
 918   /* We want the number of loop header executions which is the number
 919      of latch executions plus one.
 920      ???  For UINT_MAX latch executions this number overflows to zero
 921      for loops like do { n++; } while (n != 0);  */
 922   if (niter && !chrec_contains_undetermined (niter))
 923     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 924                           build_int_cst (TREE_TYPE (niter), 1));
 925   *number_of_iterations = niter;
 926
 927   return cond;
 928 }
 929
 930 /* Function bb_in_loop_p
 931
 932    Used as predicate for dfs order traversal of the loop bbs.  */
 933
 934 static bool
 935 bb_in_loop_p (const_basic_block bb, const void *data)
 936 {
 937   const class loop *const loop = (const class loop *)data;
 938   if (flow_bb_inside_loop_p (loop, bb))
 939     return true;
 940   return false;
 941 }
 942
 943
 944 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 945    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 946
 947 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 948   : vec_info (vec_info::loop, shared),
 949     loop (loop_in),
 950     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 951     num_itersm1 (NULL_TREE),
 952     num_iters (NULL_TREE),
 953     num_iters_unchanged (NULL_TREE),
 954     num_iters_assumptions (NULL_TREE),
 955     vector_costs (nullptr),
 956     scalar_costs (nullptr),
 957     th (0),
 958     versioning_threshold (0),
 959     vectorization_factor (0),
 960     main_loop_edge (nullptr),
 961     skip_main_loop_edge (nullptr),
 962     skip_this_loop_edge (nullptr),
 963     reusable_accumulators (),
 964     suggested_unroll_factor (1),
 965     max_vectorization_factor (0),
 966     mask_skip_niters (NULL_TREE),
 967     rgroup_compare_type (NULL_TREE),
 968     simd_if_cond (NULL_TREE),
 969     partial_vector_style (vect_partial_vectors_none),
 970     unaligned_dr (NULL),
 971     peeling_for_alignment (0),
 972     ptr_mask (0),
 973     ivexpr_map (NULL),
 974     scan_map (NULL),
 975     slp_unrolling_factor (1),
 976     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
 977     vectorizable (false),
 978     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 979     using_partial_vectors_p (false),
 980     using_decrementing_iv_p (false),
 981     using_select_vl_p (false),
 982     epil_using_partial_vectors_p (false),
 983     partial_load_store_bias (0),
 984     peeling_for_gaps (false),
 985     peeling_for_niter (false),
 986     no_data_dependencies (false),
 987     has_mask_store (false),
 988     scalar_loop_scaling (profile_probability::uninitialized ()),
 989     scalar_loop (NULL),
 990     orig_loop_info (NULL)
 991 {
 992   /* CHECKME: We want to visit all BBs before their successors (except for
 993      latch blocks, for which this assertion wouldn't hold).  In the simple
 994      case of the loop forms we allow, a dfs order of the BBs would the same
 995      as reversed postorder traversal, so we are safe.  */
 996
 997   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 998                                           bbs, loop->num_nodes, loop);
 999   gcc_assert (nbbs == loop->num_nodes);
1000
1001   for (unsigned int i = 0; i < nbbs; i++)
1002     {
1003       basic_block bb = bbs[i];
1004       gimple_stmt_iterator si;
1005
1006       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1007         {
1008           gimple *phi = gsi_stmt (si);
1009           gimple_set_uid (phi, 0);
1010           add_stmt (phi);
1011         }
1012
1013       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1014         {
1015           gimple *stmt = gsi_stmt (si);
1016           gimple_set_uid (stmt, 0);
1017           if (is_gimple_debug (stmt))
1018             continue;
1019           add_stmt (stmt);
1020           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1021              third argument is the #pragma omp simd if (x) condition, when 0,
1022              loop shouldn't be vectorized, when non-zero constant, it should
1023              be vectorized normally, otherwise versioned with vectorized loop
1024              done if the condition is non-zero at runtime.  */
1025           if (loop_in->simduid
1026               && is_gimple_call (stmt)
1027               && gimple_call_internal_p (stmt)
1028               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1029               && gimple_call_num_args (stmt) >= 3
1030               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1031               && (loop_in->simduid
1032                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1033             {
1034               tree arg = gimple_call_arg (stmt, 2);
1035               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1036                 simd_if_cond = arg;
1037               else
1038                 gcc_assert (integer_nonzerop (arg));
1039             }
1040         }
1041     }
1042
1043   epilogue_vinfos.create (6);
1044 }
1045
1046 /* Free all levels of rgroup CONTROLS.  */
1047
1048 void
1049 release_vec_loop_controls (vec<rgroup_controls> *controls)
1050 {
1051   rgroup_controls *rgc;
1052   unsigned int i;
1053   FOR_EACH_VEC_ELT (*controls, i, rgc)
1054     rgc->controls.release ();
1055   controls->release ();
1056 }
1057
1058 /* Free all memory used by the _loop_vec_info, as well as all the
1059    stmt_vec_info structs of all the stmts in the loop.  */
1060
1061 _loop_vec_info::~_loop_vec_info ()
1062 {
1063   free (bbs);
1064
1065   release_vec_loop_controls (&masks.rgc_vec);
1066   release_vec_loop_controls (&lens);
1067   delete ivexpr_map;
1068   delete scan_map;
1069   epilogue_vinfos.release ();
1070   delete scalar_costs;
1071   delete vector_costs;
1072
1073   /* When we release an epiloge vinfo that we do not intend to use
1074      avoid clearing AUX of the main loop which should continue to
1075      point to the main loop vinfo since otherwise we'll leak that.  */
1076   if (loop->aux == this)
1077     loop->aux = NULL;
1078 }
1079
1080 /* Return an invariant or register for EXPR and emit necessary
1081    computations in the LOOP_VINFO loop preheader.  */
1082
1083 tree
1084 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1085 {
1086   if (is_gimple_reg (expr)
1087       || is_gimple_min_invariant (expr))
1088     return expr;
1089
1090   if (! loop_vinfo->ivexpr_map)
1091     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1092   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1093   if (! cached)
1094     {
1095       gimple_seq stmts = NULL;
1096       cached = force_gimple_operand (unshare_expr (expr),
1097                                      &stmts, true, NULL_TREE);
1098       if (stmts)
1099         {
1100           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1101           gsi_insert_seq_on_edge_immediate (e, stmts);
1102         }
1103     }
1104   return cached;
1105 }
1106
1107 /* Return true if we can use CMP_TYPE as the comparison type to produce
1108    all masks required to mask LOOP_VINFO.  */
1109
1110 static bool
1111 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1112 {
1113   rgroup_controls *rgm;
1114   unsigned int i;
1115   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1116     if (rgm->type != NULL_TREE
1117         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1118                                             cmp_type, rgm->type,
1119                                             OPTIMIZE_FOR_SPEED))
1120       return false;
1121   return true;
1122 }
1123
1124 /* Calculate the maximum number of scalars per iteration for every
1125    rgroup in LOOP_VINFO.  */
1126
1127 static unsigned int
1128 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1129 {
1130   unsigned int res = 1;
1131   unsigned int i;
1132   rgroup_controls *rgm;
1133   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1134     res = MAX (res, rgm->max_nscalars_per_iter);
1135   return res;
1136 }
1137
1138 /* Calculate the minimum precision necessary to represent:
1139
1140       MAX_NITERS * FACTOR
1141
1142    as an unsigned integer, where MAX_NITERS is the maximum number of
1143    loop header iterations for the original scalar form of LOOP_VINFO.  */
1144
1145 static unsigned
1146 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1147 {
1148   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1149
1150   /* Get the maximum number of iterations that is representable
1151      in the counter type.  */
1152   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1153   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1154
1155   /* Get a more refined estimate for the number of iterations.  */
1156   widest_int max_back_edges;
1157   if (max_loop_iterations (loop, &max_back_edges))
1158     max_ni = wi::smin (max_ni, max_back_edges + 1);
1159
1160   /* Work out how many bits we need to represent the limit.  */
1161   return wi::min_precision (max_ni * factor, UNSIGNED);
1162 }
1163
1164 /* True if the loop needs peeling or partial vectors when vectorized.  */
1165
1166 static bool
1167 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1168 {
1169   unsigned HOST_WIDE_INT const_vf;
1170   HOST_WIDE_INT max_niter
1171     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1172
1173   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1174   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1175     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1176                                           (loop_vinfo));
1177
1178   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1179       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1180     {
1181       /* Work out the (constant) number of iterations that need to be
1182          peeled for reasons other than niters.  */
1183       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1184       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1185         peel_niter += 1;
1186       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1187                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1188         return true;
1189     }
1190   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1191       /* ??? When peeling for gaps but not alignment, we could
1192          try to check whether the (variable) niters is known to be
1193          VF * N + 1.  That's something of a niche case though.  */
1194       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1195       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1196       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1197            < (unsigned) exact_log2 (const_vf))
1198           /* In case of versioning, check if the maximum number of
1199              iterations is greater than th.  If they are identical,
1200              the epilogue is unnecessary.  */
1201           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1202               || ((unsigned HOST_WIDE_INT) max_niter
1203                   > (th / const_vf) * const_vf))))
1204     return true;
1205
1206   return false;
1207 }
1208
1209 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1210    whether we can actually generate the masks required.  Return true if so,
1211    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1212
1213 static bool
1214 vect_verify_full_masking (loop_vec_info loop_vinfo)
1215 {
1216   unsigned int min_ni_width;
1217
1218   /* Use a normal loop if there are no statements that need masking.
1219      This only happens in rare degenerate cases: it means that the loop
1220      has no loads, no stores, and no live-out values.  */
1221   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1222     return false;
1223
1224   /* Produce the rgroup controls.  */
1225   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1226     {
1227       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1228       tree vectype = mask.first;
1229       unsigned nvectors = mask.second;
1230
1231       if (masks->rgc_vec.length () < nvectors)
1232         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1233       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1234       /* The number of scalars per iteration and the number of vectors are
1235          both compile-time constants.  */
1236       unsigned int nscalars_per_iter
1237           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1238                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1239
1240       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1241         {
1242           rgm->max_nscalars_per_iter = nscalars_per_iter;
1243           rgm->type = truth_type_for (vectype);
1244           rgm->factor = 1;
1245         }
1246     }
1247
1248   unsigned int max_nscalars_per_iter
1249     = vect_get_max_nscalars_per_iter (loop_vinfo);
1250
1251   /* Work out how many bits we need to represent the limit.  */
1252   min_ni_width
1253     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1254
1255   /* Find a scalar mode for which WHILE_ULT is supported.  */
1256   opt_scalar_int_mode cmp_mode_iter;
1257   tree cmp_type = NULL_TREE;
1258   tree iv_type = NULL_TREE;
1259   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1260   unsigned int iv_precision = UINT_MAX;
1261
1262   if (iv_limit != -1)
1263     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1264                                       UNSIGNED);
1265
1266   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1267     {
1268       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1269       if (cmp_bits >= min_ni_width
1270           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1271         {
1272           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1273           if (this_type
1274               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1275             {
1276               /* Although we could stop as soon as we find a valid mode,
1277                  there are at least two reasons why that's not always the
1278                  best choice:
1279
1280                  - An IV that's Pmode or wider is more likely to be reusable
1281                    in address calculations than an IV that's narrower than
1282                    Pmode.
1283
1284                  - Doing the comparison in IV_PRECISION or wider allows
1285                    a natural 0-based IV, whereas using a narrower comparison
1286                    type requires mitigations against wrap-around.
1287
1288                  Conversely, if the IV limit is variable, doing the comparison
1289                  in a wider type than the original type can introduce
1290                  unnecessary extensions, so picking the widest valid mode
1291                  is not always a good choice either.
1292
1293                  Here we prefer the first IV type that's Pmode or wider,
1294                  and the first comparison type that's IV_PRECISION or wider.
1295                  (The comparison type must be no wider than the IV type,
1296                  to avoid extensions in the vector loop.)
1297
1298                  ??? We might want to try continuing beyond Pmode for ILP32
1299                  targets if CMP_BITS < IV_PRECISION.  */
1300               iv_type = this_type;
1301               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1302                 cmp_type = this_type;
1303               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1304                 break;
1305             }
1306         }
1307     }
1308
1309   if (!cmp_type)
1310     {
1311       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1312       return false;
1313     }
1314
1315   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1316   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1317   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1318   return true;
1319 }
1320
1321 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1322    whether we can actually generate AVX512 style masks.  Return true if so,
1323    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1324
1325 static bool
1326 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1327 {
1328   /* Produce differently organized rgc_vec and differently check
1329      we can produce masks.  */
1330
1331   /* Use a normal loop if there are no statements that need masking.
1332      This only happens in rare degenerate cases: it means that the loop
1333      has no loads, no stores, and no live-out values.  */
1334   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1335     return false;
1336
1337   /* For the decrementing IV we need to represent all values in
1338      [0, niter + niter_skip] where niter_skip is the elements we
1339      skip in the first iteration for prologue peeling.  */
1340   tree iv_type = NULL_TREE;
1341   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1342   unsigned int iv_precision = UINT_MAX;
1343   if (iv_limit != -1)
1344     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1345
1346   /* First compute the type for the IV we use to track the remaining
1347      scalar iterations.  */
1348   opt_scalar_int_mode cmp_mode_iter;
1349   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1350     {
1351       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1352       if (cmp_bits >= iv_precision
1353           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1354         {
1355           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1356           if (iv_type)
1357             break;
1358         }
1359     }
1360   if (!iv_type)
1361     return false;
1362
1363   /* Produce the rgroup controls.  */
1364   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1365     {
1366       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1367       tree vectype = mask.first;
1368       unsigned nvectors = mask.second;
1369
1370       /* The number of scalars per iteration and the number of vectors are
1371          both compile-time constants.  */
1372       unsigned int nscalars_per_iter
1373         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1374                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1375
1376       /* We index the rgroup_controls vector with nscalars_per_iter
1377          which we keep constant and instead have a varying nvectors,
1378          remembering the vector mask with the fewest nV.  */
1379       if (masks->rgc_vec.length () < nscalars_per_iter)
1380         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1381       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1382
1383       if (!rgm->type || rgm->factor > nvectors)
1384         {
1385           rgm->type = truth_type_for (vectype);
1386           rgm->compare_type = NULL_TREE;
1387           rgm->max_nscalars_per_iter = nscalars_per_iter;
1388           rgm->factor = nvectors;
1389           rgm->bias_adjusted_ctrl = NULL_TREE;
1390         }
1391     }
1392
1393   /* There is no fixed compare type we are going to use but we have to
1394      be able to get at one for each mask group.  */
1395   unsigned int min_ni_width
1396     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1397
1398   bool ok = true;
1399   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1400     {
1401       tree mask_type = rgc.type;
1402       if (!mask_type)
1403         continue;
1404
1405       if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1406         {
1407           ok = false;
1408           break;
1409         }
1410
1411       /* If iv_type is usable as compare type use that - we can elide the
1412          saturation in that case.   */
1413       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1414         {
1415           tree cmp_vectype
1416             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1417           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1418             rgc.compare_type = cmp_vectype;
1419         }
1420       if (!rgc.compare_type)
1421         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1422           {
1423             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1424             if (cmp_bits >= min_ni_width
1425                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1426               {
1427                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1428                 if (!cmp_type)
1429                   continue;
1430
1431                 /* Check whether we can produce the mask with cmp_type.  */
1432                 tree cmp_vectype
1433                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1434                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1435                   {
1436                     rgc.compare_type = cmp_vectype;
1437                     break;
1438                   }
1439               }
1440         }
1441       if (!rgc.compare_type)
1442         {
1443           ok = false;
1444           break;
1445         }
1446     }
1447   if (!ok)
1448     {
1449       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1450       return false;
1451     }
1452
1453   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1454   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1455   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1456   return true;
1457 }
1458
1459 /* Check whether we can use vector access with length based on precison
1460    comparison.  So far, to keep it simple, we only allow the case that the
1461    precision of the target supported length is larger than the precision
1462    required by loop niters.  */
1463
1464 static bool
1465 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1466 {
1467   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1468     return false;
1469
1470   machine_mode len_load_mode, len_store_mode;
1471   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1472          .exists (&len_load_mode))
1473     return false;
1474   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1475          .exists (&len_store_mode))
1476     return false;
1477
1478   signed char partial_load_bias = internal_len_load_store_bias
1479     (IFN_LEN_LOAD, len_load_mode);
1480
1481   signed char partial_store_bias = internal_len_load_store_bias
1482     (IFN_LEN_STORE, len_store_mode);
1483
1484   gcc_assert (partial_load_bias == partial_store_bias);
1485
1486   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1487     return false;
1488
1489   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1490      len_loads with a length of zero.  In order to avoid that we prohibit
1491      more than one loop length here.  */
1492   if (partial_load_bias == -1
1493       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1494     return false;
1495
1496   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1497
1498   unsigned int max_nitems_per_iter = 1;
1499   unsigned int i;
1500   rgroup_controls *rgl;
1501   /* Find the maximum number of items per iteration for every rgroup.  */
1502   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1503     {
1504       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1505       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1506     }
1507
1508   /* Work out how many bits we need to represent the length limit.  */
1509   unsigned int min_ni_prec
1510     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1511
1512   /* Now use the maximum of below precisions for one suitable IV type:
1513      - the IV's natural precision
1514      - the precision needed to hold: the maximum number of scalar
1515        iterations multiplied by the scale factor (min_ni_prec above)
1516      - the Pmode precision
1517
1518      If min_ni_prec is less than the precision of the current niters,
1519      we perfer to still use the niters type.  Prefer to use Pmode and
1520      wider IV to avoid narrow conversions.  */
1521
1522   unsigned int ni_prec
1523     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1524   min_ni_prec = MAX (min_ni_prec, ni_prec);
1525   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1526
1527   tree iv_type = NULL_TREE;
1528   opt_scalar_int_mode tmode_iter;
1529   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1530     {
1531       scalar_mode tmode = tmode_iter.require ();
1532       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1533
1534       /* ??? Do we really want to construct one IV whose precision exceeds
1535          BITS_PER_WORD?  */
1536       if (tbits > BITS_PER_WORD)
1537         break;
1538
1539       /* Find the first available standard integral type.  */
1540       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1541         {
1542           iv_type = build_nonstandard_integer_type (tbits, true);
1543           break;
1544         }
1545     }
1546
1547   if (!iv_type)
1548     {
1549       if (dump_enabled_p ())
1550         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1551                          "can't vectorize with length-based partial vectors"
1552                          " because there is no suitable iv type.\n");
1553       return false;
1554     }
1555
1556   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1557   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1558   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1559
1560   return true;
1561 }
1562
1563 /* Calculate the cost of one scalar iteration of the loop.  */
1564 static void
1565 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1566 {
1567   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1568   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1569   int nbbs = loop->num_nodes, factor;
1570   int innerloop_iters, i;
1571
1572   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1573
1574   /* Gather costs for statements in the scalar loop.  */
1575
1576   /* FORNOW.  */
1577   innerloop_iters = 1;
1578   if (loop->inner)
1579     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1580
1581   for (i = 0; i < nbbs; i++)
1582     {
1583       gimple_stmt_iterator si;
1584       basic_block bb = bbs[i];
1585
1586       if (bb->loop_father == loop->inner)
1587         factor = innerloop_iters;
1588       else
1589         factor = 1;
1590
1591       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1592         {
1593           gimple *stmt = gsi_stmt (si);
1594           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1595
1596           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1597             continue;
1598
1599           /* Skip stmts that are not vectorized inside the loop.  */
1600           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1601           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1602               && (!STMT_VINFO_LIVE_P (vstmt_info)
1603                   || !VECTORIZABLE_CYCLE_DEF
1604                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1605             continue;
1606
1607           vect_cost_for_stmt kind;
1608           if (STMT_VINFO_DATA_REF (stmt_info))
1609             {
1610               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1611                kind = scalar_load;
1612              else
1613                kind = scalar_store;
1614             }
1615           else if (vect_nop_conversion_p (stmt_info))
1616             continue;
1617           else
1618             kind = scalar_stmt;
1619
1620           /* We are using vect_prologue here to avoid scaling twice
1621              by the inner loop factor.  */
1622           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1623                             factor, kind, stmt_info, 0, vect_prologue);
1624         }
1625     }
1626
1627   /* Now accumulate cost.  */
1628   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1629   add_stmt_costs (loop_vinfo->scalar_costs,
1630                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1631   loop_vinfo->scalar_costs->finish_cost (nullptr);
1632 }
1633
1634
1635 /* Function vect_analyze_loop_form.
1636
1637    Verify that certain CFG restrictions hold, including:
1638    - the loop has a pre-header
1639    - the loop has a single entry and exit
1640    - the loop exit condition is simple enough
1641    - the number of iterations can be analyzed, i.e, a countable loop.  The
1642      niter could be analyzed under some assumptions.  */
1643
1644 opt_result
1645 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1646 {
1647   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1648
1649   /* Different restrictions apply when we are considering an inner-most loop,
1650      vs. an outer (nested) loop.
1651      (FORNOW. May want to relax some of these restrictions in the future).  */
1652
1653   info->inner_loop_cond = NULL;
1654   if (!loop->inner)
1655     {
1656       /* Inner-most loop.  We currently require that the number of BBs is
1657          exactly 2 (the header and latch).  Vectorizable inner-most loops
1658          look like this:
1659
1660                         (pre-header)
1661                            |
1662                           header <--------+
1663                            | |            |
1664                            | +--> latch --+
1665                            |
1666                         (exit-bb)  */
1667
1668       if (loop->num_nodes != 2)
1669         return opt_result::failure_at (vect_location,
1670                                        "not vectorized:"
1671                                        " control flow in loop.\n");
1672
1673       if (empty_block_p (loop->header))
1674         return opt_result::failure_at (vect_location,
1675                                        "not vectorized: empty loop.\n");
1676     }
1677   else
1678     {
1679       class loop *innerloop = loop->inner;
1680       edge entryedge;
1681
1682       /* Nested loop. We currently require that the loop is doubly-nested,
1683          contains a single inner loop, and the number of BBs is exactly 5.
1684          Vectorizable outer-loops look like this:
1685
1686                         (pre-header)
1687                            |
1688                           header <---+
1689                            |         |
1690                           inner-loop |
1691                            |         |
1692                           tail ------+
1693                            |
1694                         (exit-bb)
1695
1696          The inner-loop has the properties expected of inner-most loops
1697          as described above.  */
1698
1699       if ((loop->inner)->inner || (loop->inner)->next)
1700         return opt_result::failure_at (vect_location,
1701                                        "not vectorized:"
1702                                        " multiple nested loops.\n");
1703
1704       if (loop->num_nodes != 5)
1705         return opt_result::failure_at (vect_location,
1706                                        "not vectorized:"
1707                                        " control flow in loop.\n");
1708
1709       entryedge = loop_preheader_edge (innerloop);
1710       if (entryedge->src != loop->header
1711           || !single_exit (innerloop)
1712           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1713         return opt_result::failure_at (vect_location,
1714                                        "not vectorized:"
1715                                        " unsupported outerloop form.\n");
1716
1717       /* Analyze the inner-loop.  */
1718       vect_loop_form_info inner;
1719       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1720       if (!res)
1721         {
1722           if (dump_enabled_p ())
1723             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724                              "not vectorized: Bad inner loop.\n");
1725           return res;
1726         }
1727
1728       /* Don't support analyzing niter under assumptions for inner
1729          loop.  */
1730       if (!integer_onep (inner.assumptions))
1731         return opt_result::failure_at (vect_location,
1732                                        "not vectorized: Bad inner loop.\n");
1733
1734       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1735         return opt_result::failure_at (vect_location,
1736                                        "not vectorized: inner-loop count not"
1737                                        " invariant.\n");
1738
1739       if (dump_enabled_p ())
1740         dump_printf_loc (MSG_NOTE, vect_location,
1741                          "Considering outer-loop vectorization.\n");
1742       info->inner_loop_cond = inner.loop_cond;
1743     }
1744
1745   if (!single_exit (loop))
1746     return opt_result::failure_at (vect_location,
1747                                    "not vectorized: multiple exits.\n");
1748   if (EDGE_COUNT (loop->header->preds) != 2)
1749     return opt_result::failure_at (vect_location,
1750                                    "not vectorized:"
1751                                    " too many incoming edges.\n");
1752
1753   /* We assume that the loop exit condition is at the end of the loop. i.e,
1754      that the loop is represented as a do-while (with a proper if-guard
1755      before the loop if needed), where the loop header contains all the
1756      executable statements, and the latch is empty.  */
1757   if (!empty_block_p (loop->latch)
1758       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1759     return opt_result::failure_at (vect_location,
1760                                    "not vectorized: latch block not empty.\n");
1761
1762   /* Make sure the exit is not abnormal.  */
1763   edge e = single_exit (loop);
1764   if (e->flags & EDGE_ABNORMAL)
1765     return opt_result::failure_at (vect_location,
1766                                    "not vectorized:"
1767                                    " abnormal loop exit edge.\n");
1768
1769   info->loop_cond
1770     = vect_get_loop_niters (loop, &info->assumptions,
1771                             &info->number_of_iterations,
1772                             &info->number_of_iterationsm1);
1773   if (!info->loop_cond)
1774     return opt_result::failure_at
1775       (vect_location,
1776        "not vectorized: complicated exit condition.\n");
1777
1778   if (integer_zerop (info->assumptions)
1779       || !info->number_of_iterations
1780       || chrec_contains_undetermined (info->number_of_iterations))
1781     return opt_result::failure_at
1782       (info->loop_cond,
1783        "not vectorized: number of iterations cannot be computed.\n");
1784
1785   if (integer_zerop (info->number_of_iterations))
1786     return opt_result::failure_at
1787       (info->loop_cond,
1788        "not vectorized: number of iterations = 0.\n");
1789
1790   if (!(tree_fits_shwi_p (info->number_of_iterations)
1791         && tree_to_shwi (info->number_of_iterations) > 0))
1792     {
1793       if (dump_enabled_p ())
1794         {
1795           dump_printf_loc (MSG_NOTE, vect_location,
1796                            "Symbolic number of iterations is ");
1797           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1798           dump_printf (MSG_NOTE, "\n");
1799         }
1800     }
1801
1802   return opt_result::success ();
1803 }
1804
1805 /* Create a loop_vec_info for LOOP with SHARED and the
1806    vect_analyze_loop_form result.  */
1807
1808 loop_vec_info
1809 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1810                         const vect_loop_form_info *info,
1811                         loop_vec_info main_loop_info)
1812 {
1813   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1814   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1815   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1816   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1817   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1818   /* Also record the assumptions for versioning.  */
1819   if (!integer_onep (info->assumptions) && !main_loop_info)
1820     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1821
1822   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1823   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824   if (info->inner_loop_cond)
1825     {
1826       stmt_vec_info inner_loop_cond_info
1827         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1828       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1829       /* If we have an estimate on the number of iterations of the inner
1830          loop use that to limit the scale for costing, otherwise use
1831          --param vect-inner-loop-cost-factor literally.  */
1832       widest_int nit;
1833       if (estimated_stmt_executions (loop->inner, &nit))
1834         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1835           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1836     }
1837
1838   return loop_vinfo;
1839 }
1840
1841
1842
1843 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1844    statements update the vectorization factor.  */
1845
1846 static void
1847 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1848 {
1849   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1850   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1851   int nbbs = loop->num_nodes;
1852   poly_uint64 vectorization_factor;
1853   int i;
1854
1855   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1856
1857   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1858   gcc_assert (known_ne (vectorization_factor, 0U));
1859
1860   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1861      vectorization factor of the loop is the unrolling factor required by
1862      the SLP instances.  If that unrolling factor is 1, we say, that we
1863      perform pure SLP on loop - cross iteration parallelism is not
1864      exploited.  */
1865   bool only_slp_in_loop = true;
1866   for (i = 0; i < nbbs; i++)
1867     {
1868       basic_block bb = bbs[i];
1869       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1870            gsi_next (&si))
1871         {
1872           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1873           if (!stmt_info)
1874             continue;
1875           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1876                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1877               && !PURE_SLP_STMT (stmt_info))
1878             /* STMT needs both SLP and loop-based vectorization.  */
1879             only_slp_in_loop = false;
1880         }
1881       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1882            gsi_next (&si))
1883         {
1884           if (is_gimple_debug (gsi_stmt (si)))
1885             continue;
1886           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1887           stmt_info = vect_stmt_to_vectorize (stmt_info);
1888           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1889                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1890               && !PURE_SLP_STMT (stmt_info))
1891             /* STMT needs both SLP and loop-based vectorization.  */
1892             only_slp_in_loop = false;
1893         }
1894     }
1895
1896   if (only_slp_in_loop)
1897     {
1898       if (dump_enabled_p ())
1899         dump_printf_loc (MSG_NOTE, vect_location,
1900                          "Loop contains only SLP stmts\n");
1901       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1902     }
1903   else
1904     {
1905       if (dump_enabled_p ())
1906         dump_printf_loc (MSG_NOTE, vect_location,
1907                          "Loop contains SLP and non-SLP stmts\n");
1908       /* Both the vectorization factor and unroll factor have the form
1909          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1910          so they must have a common multiple.  */
1911       vectorization_factor
1912         = force_common_multiple (vectorization_factor,
1913                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1914     }
1915
1916   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1917   if (dump_enabled_p ())
1918     {
1919       dump_printf_loc (MSG_NOTE, vect_location,
1920                        "Updating vectorization factor to ");
1921       dump_dec (MSG_NOTE, vectorization_factor);
1922       dump_printf (MSG_NOTE, ".\n");
1923     }
1924 }
1925
1926 /* Return true if STMT_INFO describes a double reduction phi and if
1927    the other phi in the reduction is also relevant for vectorization.
1928    This rejects cases such as:
1929
1930       outer1:
1931         x_1 = PHI <x_3(outer2), ...>;
1932         ...
1933
1934       inner:
1935         x_2 = ...;
1936         ...
1937
1938       outer2:
1939         x_3 = PHI <x_2(inner)>;
1940
1941    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1942
1943 static bool
1944 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1945 {
1946   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1947     return false;
1948
1949   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1950 }
1951
1952 /* Function vect_analyze_loop_operations.
1953
1954    Scan the loop stmts and make sure they are all vectorizable.  */
1955
1956 static opt_result
1957 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1958 {
1959   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1960   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1961   int nbbs = loop->num_nodes;
1962   int i;
1963   stmt_vec_info stmt_info;
1964   bool need_to_vectorize = false;
1965   bool ok;
1966
1967   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1968
1969   auto_vec<stmt_info_for_cost> cost_vec;
1970
1971   for (i = 0; i < nbbs; i++)
1972     {
1973       basic_block bb = bbs[i];
1974
1975       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1976            gsi_next (&si))
1977         {
1978           gphi *phi = si.phi ();
1979           ok = true;
1980
1981           stmt_info = loop_vinfo->lookup_stmt (phi);
1982           if (dump_enabled_p ())
1983             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1984                              (gimple *) phi);
1985           if (virtual_operand_p (gimple_phi_result (phi)))
1986             continue;
1987
1988           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1989              (i.e., a phi in the tail of the outer-loop).  */
1990           if (! is_loop_header_bb_p (bb))
1991             {
1992               /* FORNOW: we currently don't support the case that these phis
1993                  are not used in the outerloop (unless it is double reduction,
1994                  i.e., this phi is vect_reduction_def), cause this case
1995                  requires to actually do something here.  */
1996               if (STMT_VINFO_LIVE_P (stmt_info)
1997                   && !vect_active_double_reduction_p (stmt_info))
1998                 return opt_result::failure_at (phi,
1999                                                "Unsupported loop-closed phi"
2000                                                " in outer-loop.\n");
2001
2002               /* If PHI is used in the outer loop, we check that its operand
2003                  is defined in the inner loop.  */
2004               if (STMT_VINFO_RELEVANT_P (stmt_info))
2005                 {
2006                   tree phi_op;
2007
2008                   if (gimple_phi_num_args (phi) != 1)
2009                     return opt_result::failure_at (phi, "unsupported phi");
2010
2011                   phi_op = PHI_ARG_DEF (phi, 0);
2012                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2013                   if (!op_def_info)
2014                     return opt_result::failure_at (phi, "unsupported phi\n");
2015
2016                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2017                       && (STMT_VINFO_RELEVANT (op_def_info)
2018                           != vect_used_in_outer_by_reduction))
2019                     return opt_result::failure_at (phi, "unsupported phi\n");
2020
2021                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2022                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2023                            == vect_double_reduction_def))
2024                       && !vectorizable_lc_phi (loop_vinfo,
2025                                                stmt_info, NULL, NULL))
2026                     return opt_result::failure_at (phi, "unsupported phi\n");
2027                 }
2028
2029               continue;
2030             }
2031
2032           gcc_assert (stmt_info);
2033
2034           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2035                || STMT_VINFO_LIVE_P (stmt_info))
2036               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2037               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2038             /* A scalar-dependence cycle that we don't support.  */
2039             return opt_result::failure_at (phi,
2040                                            "not vectorized:"
2041                                            " scalar dependence cycle.\n");
2042
2043           if (STMT_VINFO_RELEVANT_P (stmt_info))
2044             {
2045               need_to_vectorize = true;
2046               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2047                   && ! PURE_SLP_STMT (stmt_info))
2048                 ok = vectorizable_induction (loop_vinfo,
2049                                              stmt_info, NULL, NULL,
2050                                              &cost_vec);
2051               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2052                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2053                             == vect_double_reduction_def)
2054                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2055                        && ! PURE_SLP_STMT (stmt_info))
2056                 ok = vectorizable_reduction (loop_vinfo,
2057                                              stmt_info, NULL, NULL, &cost_vec);
2058               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2059                         == vect_first_order_recurrence)
2060                        && ! PURE_SLP_STMT (stmt_info))
2061                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2062                                            &cost_vec);
2063             }
2064
2065           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2066           if (ok
2067               && STMT_VINFO_LIVE_P (stmt_info)
2068               && !PURE_SLP_STMT (stmt_info))
2069             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2070                                               -1, false, &cost_vec);
2071
2072           if (!ok)
2073             return opt_result::failure_at (phi,
2074                                            "not vectorized: relevant phi not "
2075                                            "supported: %G",
2076                                            static_cast <gimple *> (phi));
2077         }
2078
2079       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2080            gsi_next (&si))
2081         {
2082           gimple *stmt = gsi_stmt (si);
2083           if (!gimple_clobber_p (stmt)
2084               && !is_gimple_debug (stmt))
2085             {
2086               opt_result res
2087                 = vect_analyze_stmt (loop_vinfo,
2088                                      loop_vinfo->lookup_stmt (stmt),
2089                                      &need_to_vectorize,
2090                                      NULL, NULL, &cost_vec);
2091               if (!res)
2092                 return res;
2093             }
2094         }
2095     } /* bbs */
2096
2097   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2098
2099   /* All operations in the loop are either irrelevant (deal with loop
2100      control, or dead), or only used outside the loop and can be moved
2101      out of the loop (e.g. invariants, inductions).  The loop can be
2102      optimized away by scalar optimizations.  We're better off not
2103      touching this loop.  */
2104   if (!need_to_vectorize)
2105     {
2106       if (dump_enabled_p ())
2107         dump_printf_loc (MSG_NOTE, vect_location,
2108                          "All the computation can be taken out of the loop.\n");
2109       return opt_result::failure_at
2110         (vect_location,
2111          "not vectorized: redundant loop. no profit to vectorize.\n");
2112     }
2113
2114   return opt_result::success ();
2115 }
2116
2117 /* Return true if we know that the iteration count is smaller than the
2118    vectorization factor.  Return false if it isn't, or if we can't be sure
2119    either way.  */
2120
2121 static bool
2122 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2123 {
2124   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2125
2126   HOST_WIDE_INT max_niter;
2127   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2128     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2129   else
2130     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2131
2132   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2133     return true;
2134
2135   return false;
2136 }
2137
2138 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2139    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2140    definitely no, or -1 if it's worth retrying.  */
2141
2142 static int
2143 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2144                            unsigned *suggested_unroll_factor)
2145 {
2146   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2147   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2148
2149   /* Only loops that can handle partially-populated vectors can have iteration
2150      counts less than the vectorization factor.  */
2151   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2152       && vect_known_niters_smaller_than_vf (loop_vinfo))
2153     {
2154       if (dump_enabled_p ())
2155         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2156                          "not vectorized: iteration count smaller than "
2157                          "vectorization factor.\n");
2158       return 0;
2159     }
2160
2161   /* If we know the number of iterations we can do better, for the
2162      epilogue we can also decide whether the main loop leaves us
2163      with enough iterations, prefering a smaller vector epilog then
2164      also possibly used for the case we skip the vector loop.  */
2165   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2166     {
2167       widest_int scalar_niters
2168         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2169       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2170         {
2171           loop_vec_info orig_loop_vinfo
2172             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2173           unsigned lowest_vf
2174             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2175           int prolog_peeling = 0;
2176           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2177             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2178           if (prolog_peeling >= 0
2179               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2180                            lowest_vf))
2181             {
2182               unsigned gap
2183                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2184               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2185                                % lowest_vf + gap);
2186             }
2187         }
2188       /* Reject vectorizing for a single scalar iteration, even if
2189          we could in principle implement that using partial vectors.  */
2190       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2191       if (scalar_niters <= peeling_gap + 1)
2192         {
2193           if (dump_enabled_p ())
2194             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2195                              "not vectorized: loop only has a single "
2196                              "scalar iteration.\n");
2197           return 0;
2198         }
2199
2200       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2201         {
2202           /* Check that the loop processes at least one full vector.  */
2203           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2204           if (known_lt (scalar_niters, vf))
2205             {
2206               if (dump_enabled_p ())
2207                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208                                  "loop does not have enough iterations "
2209                                  "to support vectorization.\n");
2210               return 0;
2211             }
2212
2213           /* If we need to peel an extra epilogue iteration to handle data
2214              accesses with gaps, check that there are enough scalar iterations
2215              available.
2216
2217              The check above is redundant with this one when peeling for gaps,
2218              but the distinction is useful for diagnostics.  */
2219           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2220               && known_le (scalar_niters, vf))
2221             {
2222               if (dump_enabled_p ())
2223                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224                                  "loop does not have enough iterations "
2225                                  "to support peeling for gaps.\n");
2226               return 0;
2227             }
2228         }
2229     }
2230
2231   /* If using the "very cheap" model. reject cases in which we'd keep
2232      a copy of the scalar code (even if we might be able to vectorize it).  */
2233   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2234       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2235           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2236           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2237     {
2238       if (dump_enabled_p ())
2239         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240                          "some scalar iterations would need to be peeled\n");
2241       return 0;
2242     }
2243
2244   int min_profitable_iters, min_profitable_estimate;
2245   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2246                                       &min_profitable_estimate,
2247                                       suggested_unroll_factor);
2248
2249   if (min_profitable_iters < 0)
2250     {
2251       if (dump_enabled_p ())
2252         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2253                          "not vectorized: vectorization not profitable.\n");
2254       if (dump_enabled_p ())
2255         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256                          "not vectorized: vector version will never be "
2257                          "profitable.\n");
2258       return -1;
2259     }
2260
2261   int min_scalar_loop_bound = (param_min_vect_loop_bound
2262                                * assumed_vf);
2263
2264   /* Use the cost model only if it is more conservative than user specified
2265      threshold.  */
2266   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2267                                     min_profitable_iters);
2268
2269   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2270
2271   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2272       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2273     {
2274       if (dump_enabled_p ())
2275         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2276                          "not vectorized: vectorization not profitable.\n");
2277       if (dump_enabled_p ())
2278         dump_printf_loc (MSG_NOTE, vect_location,
2279                          "not vectorized: iteration count smaller than user "
2280                          "specified loop bound parameter or minimum profitable "
2281                          "iterations (whichever is more conservative).\n");
2282       return 0;
2283     }
2284
2285   /* The static profitablity threshold min_profitable_estimate includes
2286      the cost of having to check at runtime whether the scalar loop
2287      should be used instead.  If it turns out that we don't need or want
2288      such a check, the threshold we should use for the static estimate
2289      is simply the point at which the vector loop becomes more profitable
2290      than the scalar loop.  */
2291   if (min_profitable_estimate > min_profitable_iters
2292       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2293       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2294       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2295       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2296     {
2297       if (dump_enabled_p ())
2298         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2299                          " choice between the scalar and vector loops\n");
2300       min_profitable_estimate = min_profitable_iters;
2301     }
2302
2303   /* If the vector loop needs multiple iterations to be beneficial then
2304      things are probably too close to call, and the conservative thing
2305      would be to stick with the scalar code.  */
2306   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2307       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2308     {
2309       if (dump_enabled_p ())
2310         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311                          "one iteration of the vector loop would be"
2312                          " more expensive than the equivalent number of"
2313                          " iterations of the scalar loop\n");
2314       return 0;
2315     }
2316
2317   HOST_WIDE_INT estimated_niter;
2318
2319   /* If we are vectorizing an epilogue then we know the maximum number of
2320      scalar iterations it will cover is at least one lower than the
2321      vectorization factor of the main loop.  */
2322   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2323     estimated_niter
2324       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2325   else
2326     {
2327       estimated_niter = estimated_stmt_executions_int (loop);
2328       if (estimated_niter == -1)
2329         estimated_niter = likely_max_stmt_executions_int (loop);
2330     }
2331   if (estimated_niter != -1
2332       && ((unsigned HOST_WIDE_INT) estimated_niter
2333           < MAX (th, (unsigned) min_profitable_estimate)))
2334     {
2335       if (dump_enabled_p ())
2336         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337                          "not vectorized: estimated iteration count too "
2338                          "small.\n");
2339       if (dump_enabled_p ())
2340         dump_printf_loc (MSG_NOTE, vect_location,
2341                          "not vectorized: estimated iteration count smaller "
2342                          "than specified loop bound parameter or minimum "
2343                          "profitable iterations (whichever is more "
2344                          "conservative).\n");
2345       return -1;
2346     }
2347
2348   return 1;
2349 }
2350
2351 /* Append the data references of every statement of LOOP (blocks BBS) to
2352    DATAREFS and count the non-debug statements in *N_STMTS.  Fail at the
2353    first statement that cannot be handled, or past the dataref limit.  */
2354 static opt_result
2355 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2356                            vec<data_reference_p> *datarefs,
2357                            unsigned int *n_stmts)
2358 {
2359   /* Whether T is a decl, or a reference with a base address.  */
2360   auto mem_operand_p = [] (tree t) -> bool
2361     {
2362       return DECL_P (t) || (REFERENCE_CLASS_P (t) && get_base_address (t));
2363     };
2364   *n_stmts = 0;
2365   for (unsigned bb_idx = 0; bb_idx < loop->num_nodes; bb_idx++)
2366     {
2367       gimple_stmt_iterator it = gsi_start_bb (bbs[bb_idx]);
2368       for (; !gsi_end_p (it); gsi_next (&it))
2369         {
2370           gimple *cur = gsi_stmt (it);
2371           if (is_gimple_debug (cur))
2372             continue;
2373           *n_stmts += 1;
2374           opt_result found
2375             = vect_find_stmt_data_reference (loop, cur, datarefs, NULL, 0);
2376           if (!found)
2377             {
2378               /* The only failures tolerated are certain calls in a loop
2379                  with a safelen; everything else is handed back as is.  */
2380               if (!is_gimple_call (cur) || !loop->safelen)
2381                 return found;
2382               tree callee = gimple_call_fndecl (cur);
2383               if (callee == NULL_TREE
2384                   && gimple_call_internal_p (cur, IFN_MASK_CALL))
2385                 {
2386                   /* Argument 0 is expected to be the callee's address.  */
2387                   callee = gimple_call_arg (cur, 0);
2388                   gcc_checking_assert (TREE_CODE (callee) == ADDR_EXPR);
2389                   callee = TREE_OPERAND (callee, 0);
2390                   gcc_checking_assert (TREE_CODE (callee) == FUNCTION_DECL);
2391                 }
2392               if (callee == NULL_TREE)
2393                 return found;
2394               cgraph_node *cnode = cgraph_node::get (callee);
2395               if (cnode == NULL || cnode->simd_clones == NULL)
2396                 return found;
2397               /* Ignore #pragma omp declare simd functions, but only if
2398                  no argument and no lhs of the call is a data reference.  */
2399               bool touches_memory = false;
2400               unsigned int nargs = gimple_call_num_args (cur);
2401               for (unsigned int a = 0; a < nargs && !touches_memory; a++)
2402                 touches_memory = mem_operand_p (gimple_call_arg (cur, a));
2403               tree lhs = gimple_call_lhs (cur);
2404               if (touches_memory || (lhs && mem_operand_p (lhs)))
2405                 return found;
2406               continue;
2407             }
2408           /* If dependence analysis will give up due to the limit on the
2409              number of datarefs stop here and fail fatally.  */
2410           if (datarefs->length ()
2411               > (unsigned) param_loop_max_datarefs_for_datadeps)
2412             return opt_result::failure_at (cur, "exceeded param "
2413                                            "loop-max-datarefs-for-datadeps\n");
2414         }
2415     }
2416   return opt_result::success ();
2417 }
2418
2419 /* Look for access groups that exist only for the sake of SLP but whose
2420    statements have no SLP type, and turn each of their accesses into a
2421    group of its own.  */
2422 static void
2423 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2424 {
2425   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2426
2427   vec<data_reference_p> drs = LOOP_VINFO_DATAREFS (loop_vinfo);
2428   for (unsigned int idx = 0; idx < drs.length (); ++idx)
2429     {
2430       data_reference_p cur_dr = drs[idx];
2431       gcc_assert (DR_REF (cur_dr));
2432       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (cur_dr));
2433
2434       /* Accesses that are not part of an interleaving chain need no
2435          work.  */
2436       if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2437         continue;
2438
2439       stmt_vec_info leader = DR_GROUP_FIRST_ELEMENT (stmt_info);
2440       dr_vec_info *leader_dr = STMT_VINFO_DR_INFO (leader);
2441       unsigned int orig_size = DR_GROUP_SIZE (leader);
2442
2443       /* Neither do groups that are not SLP-only, nor statements that
2444          have an SLP type.  */
2445       if (STMT_SLP_TYPE (stmt_info) || !STMT_VINFO_SLP_VECT_ONLY (leader))
2446         continue;
2447
2448       /* Dissolve the group, one member at a time.  */
2449       STMT_VINFO_SLP_VECT_ONLY (leader) = false;
2450       stmt_vec_info next;
2451       for (stmt_vec_info member = leader; member; member = next)
2452         {
2453           next = DR_GROUP_NEXT_ELEMENT (member);
2454           DR_GROUP_FIRST_ELEMENT (member) = member;
2455           DR_GROUP_NEXT_ELEMENT (member) = NULL;
2456           DR_GROUP_SIZE (member) = 1;
2457
2458           /* We cannot handle stores with gaps, so those become strided
2459              accesses just like the members of strided groups.  */
2460           bool make_strided = (STMT_VINFO_STRIDED_P (leader)
2461                                || DR_IS_WRITE (leader_dr->dr));
2462           if (make_strided)
2463             STMT_VINFO_STRIDED_P (member) = true;
2464           DR_GROUP_GAP (member) = make_strided ? 0 : orig_size - 1;
2465
2466           /* Duplicate and adjust alignment info, it needs to be present
2467              on each group leader, see dr_misalignment.  The original
2468              leader already has it.  */
2469           if (member == leader)
2470             continue;
2471
2472           dr_vec_info *member_dr = STMT_VINFO_DR_INFO (member);
2473           member_dr->target_alignment = leader_dr->target_alignment;
2474           int misalign = leader_dr->misalignment;
2475           if (misalign != DR_MISALIGNMENT_UNKNOWN)
2476             {
2477               /* Shift by the distance between the two DR_INITs.  */
2478               HOST_WIDE_INT offset
2479                 = (TREE_INT_CST_LOW (DR_INIT (member_dr->dr))
2480                    - TREE_INT_CST_LOW (DR_INIT (leader_dr->dr)));
2481               unsigned HOST_WIDE_INT align
2482                 = leader_dr->target_alignment.to_constant ();
2483               misalign = (misalign + offset) % align;
2484             }
2485           member_dr->misalignment = misalign;
2486         }
2487     }
2488 }
2489
2490 /* Full vectors might not cover every scalar iteration of LOOP_VINFO.
2491    Work out whether some iterations would be left over and, if so,
2492    choose between the following ways of handling them:
2493
2494    (1) Use partial vectors instead of full vectors in LOOP_VINFO itself:
2495
2496          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2497          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2498          LOOP_VINFO_PEELING_FOR_NITER == false
2499
2500    (2) Keep LOOP_VINFO on full vectors and leave the remaining scalar
2501        iterations to an epilogue loop.  This gives:
2502
2503          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2504          LOOP_VINFO_PEELING_FOR_NITER == true
2505
2506        with two further choices for the epilogue:
2507
2508        (2a) Consider vectorizing the epilogue loop with partial vectors
2509             at the same VF as the main loop:
2510
2511               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2512
2513        (2b) Consider vectorizing the epilogue loop at lower VFs only:
2514
2515               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2516
2517    All three flags are false if no iterations are left over.  */
2518
2519 opt_result
2520 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2521 {
2522   /* Determine whether there would be any scalar iterations left over.  */
2523   const bool leftover_p = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2524   /* Default to full vectors everywhere; partial vectors only come into
2525      play when they are usable and there is something left over.  */
2526   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2527   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2528   if (leftover_p && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2529     {
2530       /* LOOP_VINFO itself uses partial vectors if it is an epilogue loop,
2531          if vect_known_niters_smaller_than_vf holds for it, or if neither
2532          of the following applies.  For partial-vector-usage=1, and also
2533          when unrolling, we instead try to push the handling of partial
2534          vectors to the epilogue, with the main loop continuing to operate
2535          on full vectors.  For unrolling this avoids the overhead of
2536          generating multiple masks and of executing entire iterations of
2537          FALSE masked instructions for one or less full iterations.
2538
2539          ??? We could then end up failing to use partial vectors if we
2540          decide to peel iterations into a prologue, and if the main loop
2541          then ends up processing fewer than VF iterations.  */
2542       bool use_here_p
2543         = ((param_vect_partial_vector_usage != 1
2544             && loop_vinfo->suggested_unroll_factor <= 1)
2545            || LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2546            || vect_known_niters_smaller_than_vf (loop_vinfo));
2547       if (use_here_p)
2548         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2549       else
2550         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2551     }
2552
2553   if (dump_enabled_p ())
2554     {
2555       const char *kind
2556         = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) ? "partial" : "full";
2557       const char *scope
2558         = LOOP_VINFO_EPILOGUE_P (loop_vinfo) ? " for epilogue loop" : "";
2559       dump_printf_loc (MSG_NOTE, vect_location,
2560                        "operating on %s vectors%s.\n", kind, scope);
2561     }
2562
2563   /* Peel only for leftovers that the loop itself does not handle.  */
2564   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2565     = (leftover_p && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2566
2567   return opt_result::success ();
2568 }
2569
2570 /* Function vect_analyze_loop_2.
2571
2572    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2573    analyses will record information in some members of LOOP_VINFO.  FATAL
2574    indicates if some analysis meets fatal error.  If a non-NULL pointer
2575    SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with
2576    the suggested unroll factor that was worked out, while a NULL pointer
2577    means that the suggested unroll factor is being applied.
2578    SLP_DONE_FOR_SUGGESTED_UF holds the slp decision made when the
2579    suggested unroll factor was worked out.  */
2580 static opt_result
2581 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2582                      unsigned *suggested_unroll_factor,
2583                      bool& slp_done_for_suggested_uf)
2584 {
2585   opt_result ok = opt_result::success ();
2586   int res;
2587   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2588   poly_uint64 min_vf = 2;
2589   loop_vec_info orig_loop_vinfo = NULL;
2590
2591   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2592      loop_vec_info of the first vectorized loop.  */
2593   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2594     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2595   else
2596     orig_loop_vinfo = loop_vinfo;
2597   gcc_assert (orig_loop_vinfo);
2598
2599   /* The first group of checks is independent of the vector size.  */
2600   fatal = true;
2601
2602   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2603       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2604     return opt_result::failure_at (vect_location,
2605                                    "not vectorized: simd if(0)\n");
2606
2607   /* Find all data references in the loop (which correspond to vdefs/vuses)
2608      and analyze their evolution in the loop.  */
2609
2610   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2611
2612   /* Gather the data references and count stmts in the loop.  */
2613   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2614     {
2615       opt_result res
2616         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2617                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2618                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2619       if (!res)
2620         {
2621           if (dump_enabled_p ())
2622             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2623                              "not vectorized: loop contains function "
2624                              "calls or data references that cannot "
2625                              "be analyzed\n");
2626           return res;
2627         }
2628       loop_vinfo->shared->save_datarefs ();
2629     }
2630   else
2631     loop_vinfo->shared->check_datarefs ();
2632
2633   /* Analyze the data references and also adjust the minimal
2634      vectorization factor according to the loads and stores.  */
2635
2636   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2637   if (!ok)
2638     {
2639       if (dump_enabled_p ())
2640         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2641                          "bad data references.\n");
2642       return ok;
2643     }
2644
2645   /* Check if we are applying unroll factor now.  */
2646   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2647   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2648
2649   /* If the slp decision is false when suggested unroll factor is worked
2650      out, and we are applying suggested unroll factor, we can simply skip
2651      all slp related analyses this time.  */
2652   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2653
2654   /* Classify all cross-iteration scalar data-flow cycles.
2655      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2656   vect_analyze_scalar_cycles (loop_vinfo, slp);
2657
2658   vect_pattern_recog (loop_vinfo);
2659
2660   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2661
2662   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2663      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2664
2665   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2666   if (!ok)
2667     {
2668       if (dump_enabled_p ())
2669         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2670                          "bad data access.\n");
2671       return ok;
2672     }
2673
2674   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2675
2676   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2677   if (!ok)
2678     {
2679       if (dump_enabled_p ())
2680         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2681                          "unexpected pattern.\n");
2682       return ok;
2683     }
2684
2685   /* While the rest of the analysis below depends on it in some way.  */
2686   fatal = false;
2687
2688   /* Analyze data dependences between the data-refs in the loop
2689      and adjust the maximum vectorization factor according to
2690      the dependences.
2691      FORNOW: fail at the first data dependence that we encounter.  */
2692
2693   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2694   if (!ok)
2695     {
2696       if (dump_enabled_p ())
2697         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2698                          "bad data dependence.\n");
2699       return ok;
2700     }
2701   if (max_vf != MAX_VECTORIZATION_FACTOR
2702       && maybe_lt (max_vf, min_vf))
2703     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2704   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2705
2706   ok = vect_determine_vectorization_factor (loop_vinfo);
2707   if (!ok)
2708     {
2709       if (dump_enabled_p ())
2710         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2711                          "can't determine vectorization factor.\n");
2712       return ok;
2713     }
2714   if (max_vf != MAX_VECTORIZATION_FACTOR
2715       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2716     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2717
2718   /* Compute the scalar iteration cost.  */
2719   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2720
2721   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2722
2723   if (slp)
2724     {
2725       /* Check the SLP opportunities in the loop, analyze and build
2726          SLP trees.  */
2727       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2728       if (!ok)
2729         return ok;
2730
2731       /* If there are any SLP instances mark them as pure_slp.  */
2732       slp = vect_make_slp_decision (loop_vinfo);
2733       if (slp)
2734         {
2735           /* Find stmts that need to be both vectorized and SLPed.  */
2736           vect_detect_hybrid_slp (loop_vinfo);
2737
2738           /* Update the vectorization factor based on the SLP decision.  */
2739           vect_update_vf_for_slp (loop_vinfo);
2740
2741           /* Optimize the SLP graph with the vectorization factor fixed.  */
2742           vect_optimize_slp (loop_vinfo);
2743
2744           /* Gather the loads reachable from the SLP graph entries.  */
2745           vect_gather_slp_loads (loop_vinfo);
2746         }
2747     }
2748
2749   bool saved_can_use_partial_vectors_p
2750     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2751
2752   /* We don't expect to have to roll back to anything other than an empty
2753      set of rgroups.  */
2754   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2755
2756   /* This is the point where we can re-start analysis with SLP forced off.  */
2757 start_over:
2758
2759   /* Apply the suggested unrolling factor, this was determined by the backend
2760      during finish_cost the first time we ran the analysis for this
2761      vector mode.  */
2762   if (applying_suggested_uf)
2763     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2764
2765   /* Now the vectorization factor is final.  */
2766   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2767   gcc_assert (known_ne (vectorization_factor, 0U));
2768
2769   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2770     {
2771       dump_printf_loc (MSG_NOTE, vect_location,
2772                        "vectorization_factor = ");
2773       dump_dec (MSG_NOTE, vectorization_factor);
2774       dump_printf (MSG_NOTE, ", niters = %wd\n",
2775                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2776     }
2777
2778   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2779
2780   /* Analyze the alignment of the data-refs in the loop.
2781      Fail if a data reference is found that cannot be vectorized.  */
2782
2783   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2784   if (!ok)
2785     {
2786       if (dump_enabled_p ())
2787         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2788                          "bad data alignment.\n");
2789       return ok;
2790     }
2791
2792   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2793      It is important to call pruning after vect_analyze_data_ref_accesses,
2794      since we use grouping information gathered by interleaving analysis.  */
2795   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2796   if (!ok)
2797     return ok;
2798
2799   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2800      vectorization, since we do not want to add extra peeling or
2801      add versioning for alignment.  */
2802   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2803     /* This pass will decide on using loop versioning and/or loop peeling in
2804        order to enhance the alignment of data references in the loop.  */
2805     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2806   if (!ok)
2807     return ok;
2808
2809   if (slp)
2810     {
2811       /* Analyze operations in the SLP instances.  Note this may
2812          remove unsupported SLP instances which makes the above
2813          SLP kind detection invalid.  */
2814       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2815       vect_slp_analyze_operations (loop_vinfo);
2816       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2817         {
2818           ok = opt_result::failure_at (vect_location,
2819                                        "unsupported SLP instances\n");
2820           goto again;
2821         }
2822
2823       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2824       slp_tree load_node, slp_root;
2825       unsigned i, x;
2826       slp_instance instance;
2827       bool can_use_lanes = true;
2828       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2829         {
2830           slp_root = SLP_INSTANCE_TREE (instance);
2831           int group_size = SLP_TREE_LANES (slp_root);
2832           tree vectype = SLP_TREE_VECTYPE (slp_root);
2833           bool loads_permuted = false;
2834           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2835             {
2836               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2837                 continue;
2838               unsigned j;
2839               stmt_vec_info load_info;
2840               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2841                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2842                   {
2843                     loads_permuted = true;
2844                     break;
2845                   }
2846             }
2847
2848           /* If the loads and stores can be handled with load/store-lane
2849              instructions record it and move on to the next instance.  */
2850           if (loads_permuted
2851               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2852               && vect_store_lanes_supported (vectype, group_size, false)
2853                    != IFN_LAST)
2854             {
2855               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2856                 {
2857                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2858                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2859                   /* Use SLP for strided accesses (or if we can't
2860                      load-lanes).  */
2861                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2862                       || vect_load_lanes_supported
2863                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2864                              DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2865                     break;
2866                 }
2867
2868               can_use_lanes
2869                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2870
2871               if (can_use_lanes && dump_enabled_p ())
2872                 dump_printf_loc (MSG_NOTE, vect_location,
2873                                  "SLP instance %p can use load/store-lanes\n",
2874                                  (void *) instance);
2875             }
2876           else
2877             {
2878               can_use_lanes = false;
2879               break;
2880             }
2881         }
2882
2883       /* If all SLP instances can use load/store-lanes abort SLP and try again
2884          with SLP disabled.  */
2885       if (can_use_lanes)
2886         {
2887           ok = opt_result::failure_at (vect_location,
2888                                        "Built SLP cancelled: can use "
2889                                        "load/store-lanes\n");
2890           if (dump_enabled_p ())
2891             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2892                              "Built SLP cancelled: all SLP instances support "
2893                              "load/store-lanes\n");
2894           goto again;
2895         }
2896     }
2897
2898   /* Dissolve SLP-only groups.  */
2899   vect_dissolve_slp_only_groups (loop_vinfo);
2900
2901   /* Scan all the remaining operations in the loop that are not subject
2902      to SLP and make sure they are vectorizable.  */
2903   ok = vect_analyze_loop_operations (loop_vinfo);
2904   if (!ok)
2905     {
2906       if (dump_enabled_p ())
2907         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908                          "bad operation or unsupported loop bound.\n");
2909       return ok;
2910     }
2911
2912   /* For now, we don't expect to mix both masking and length approaches for one
2913      loop, disable it if both are recorded.  */
2914   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2915       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2916       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2917     {
2918       if (dump_enabled_p ())
2919         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2920                          "can't vectorize a loop with partial vectors"
2921                          " because we don't expect to mix different"
2922                          " approaches with partial vectors for the"
2923                          " same loop.\n");
2924       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2925     }
2926
2927   /* If we still have the option of using partial vectors,
2928      check whether we can generate the necessary loop controls.  */
2929   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2930     {
2931       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2932         {
2933           if (!vect_verify_full_masking (loop_vinfo)
2934               && !vect_verify_full_masking_avx512 (loop_vinfo))
2935             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2936         }
2937       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2938         if (!vect_verify_loop_lens (loop_vinfo))
2939           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2940     }
2941
2942   /* If we're vectorizing a loop that uses length "controls" and
2943      can iterate more than once, we apply decrementing IV approach
2944      in loop control.  */
2945   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2946       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2947       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2948       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2949            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2950                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2951     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2952
2953   /* If a loop uses length controls and has a decrementing loop control IV,
2954      we will normally pass that IV through a MIN_EXPR to calculate the
2955      basis for the length controls.  E.g. in a loop that processes one
2956      element per scalar iteration, the number of elements would be
2957      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2958
2959      This MIN_EXPR approach allows us to use pointer IVs with an invariant
2960      step, since only the final iteration of the vector loop can have
2961      inactive lanes.
2962
2963      However, some targets have a dedicated instruction for calculating the
2964      preferred length, given the total number of elements that still need to
2965      be processed.  This is encapsulated in the SELECT_VL internal function.
2966
2967      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2968      to determine the basis for the length controls.  However, unlike the
2969      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2970      lanes inactive in any iteration of the vector loop, not just the last
2971      iteration.  This SELECT_VL approach therefore requires us to use pointer
2972      IVs with variable steps.
2973
2974      Once we've decided how many elements should be processed by one
2975      iteration of the vector loop, we need to populate the rgroup controls.
2976      If a loop has multiple rgroups, we need to make sure that those rgroups
2977      "line up" (that is, they must be consistent about which elements are
2978      active and which aren't).  This is done by vect_adjust_loop_lens_control.
2979
2980      In principle, it would be possible to use vect_adjust_loop_lens_control
2981      on either the result of a MIN_EXPR or the result of a SELECT_VL.
2982      However:
2983
2984      (1) In practice, it only makes sense to use SELECT_VL when a vector
2985          operation will be controlled directly by the result.  It is not
2986          worth using SELECT_VL if it would only be the input to other
2987          calculations.
2988
2989      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2990          pointer IV will need N updates by a variable amount (N-1 updates
2991          within the iteration and 1 update to move to the next iteration).
2992
2993      Because of this, we prefer to use the MIN_EXPR approach whenever there
2994      is more than one length control.
2995
2996      In addition, SELECT_VL always operates to a granularity of 1 unit.
2997      If we wanted to use it to control an SLP operation on N consecutive
2998      elements, we would need to make the SELECT_VL inputs measure scalar
2999      iterations (rather than elements) and then multiply the SELECT_VL
3000      result by N.  But using SELECT_VL this way is inefficient because
3001      of (1) above.
3002
3003      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3004         satisfied:
3005
3006      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3007      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3008
3009      Since SELECT_VL (variable step) will make SCEV analysis fail, and we
3010      would then lose the benefits of the following unroll optimizations,
3011      we prefer using the MIN_EXPR approach in this situation.  */
3012   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3013     {
3014       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3015       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3016                                           OPTIMIZE_FOR_SPEED)
3017           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3018           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3019           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3020               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3021         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3022     }
3023
3024   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3025      assuming that the loop will be used as a main loop.  We will redo
3026      this analysis later if we instead decide to use the loop as an
3027      epilogue loop.  */
3028   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3029   if (!ok)
3030     return ok;
3031
3032   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3033      to be able to handle fewer than VF scalars, or needs to have a lower VF
3034      than the main loop.  */
3035   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3036       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3037     {
3038       poly_uint64 unscaled_vf
3039         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3040                      orig_loop_vinfo->suggested_unroll_factor);
3041       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3042         return opt_result::failure_at (vect_location,
3043                                        "Vectorization factor too high for"
3044                                        " epilogue loop.\n");
3045     }
3046
3047   /* Check the costings of the loop make vectorizing worthwhile.  */
3048   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3049   if (res < 0)
3050     {
3051       ok = opt_result::failure_at (vect_location,
3052                                    "Loop costings may not be worthwhile.\n");
3053       goto again;
3054     }
3055   if (!res)
3056     return opt_result::failure_at (vect_location,
3057                                    "Loop costings not worthwhile.\n");
3058
3059   /* If an epilogue loop is required make sure we can create one.  */
3060   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3061       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3062     {
3063       if (dump_enabled_p ())
3064         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3065       if (!vect_can_advance_ivs_p (loop_vinfo)
3066           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3067                                            single_exit (LOOP_VINFO_LOOP
3068                                                          (loop_vinfo))))
3069         {
3070           ok = opt_result::failure_at (vect_location,
3071                                        "not vectorized: can't create required "
3072                                        "epilog loop\n");
3073           goto again;
3074         }
3075     }
3076
3077   /* During peeling, we need to check if number of loop iterations is
3078      enough for both peeled prolog loop and vector loop.  This check
3079      can be merged along with threshold check of loop versioning, so
3080      increase threshold for this case if necessary.
3081
3082      If we are analyzing an epilogue we still want to check what its
3083      versioning threshold would be.  If we decide to vectorize the epilogues we
3084      will want to use the lowest versioning threshold of all epilogues and main
3085      loop.  This will enable us to enter a vectorized epilogue even when
3086      versioning the loop.  We can't simply check whether the epilogue requires
3087      versioning though since we may have skipped some versioning checks when
3088      analyzing the epilogue.  For instance, checks for alias versioning will be
3089      skipped when dealing with epilogues as we assume we already checked them
3090      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3091   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3092     {
3093       poly_uint64 niters_th = 0;
3094       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3095
3096       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3097         {
3098           /* Niters for peeled prolog loop.  */
3099           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3100             {
3101               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3102               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3103               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3104             }
3105           else
3106             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3107         }
3108
3109       /* Niters for at least one iteration of vectorized loop.  */
3110       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3111         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3112       /* One additional iteration because of peeling for gap.  */
3113       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3114         niters_th += 1;
3115
3116       /*  Use the same condition as vect_transform_loop to decide when to use
3117           the cost to determine a versioning threshold.  */
3118       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3119           && ordered_p (th, niters_th))
3120         niters_th = ordered_max (poly_uint64 (th), niters_th);
3121
3122       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3123     }
3124
3125   gcc_assert (known_eq (vectorization_factor,
3126                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3127
3128   slp_done_for_suggested_uf = slp;
3129
3130   /* Ok to vectorize!  */
3131   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3132   return opt_result::success ();
3133
3134 again:
3135   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3136   gcc_assert (!ok);
3137
3138   /* Try again with SLP forced off but if we didn't do any SLP there is
3139      no point in re-trying.  */
3140   if (!slp)
3141     return ok;
3142
3143   /* If the slp decision is true when suggested unroll factor is worked
3144      out, and we are applying suggested unroll factor, we don't need to
3145      re-try any more.  */
3146   if (applying_suggested_uf && slp_done_for_suggested_uf)
3147     return ok;
3148
3149   /* If there are reduction chains re-trying will fail anyway.  */
3150   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3151     return ok;
3152
3153   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3154      via interleaving or lane instructions.  */
3155   slp_instance instance;
3156   slp_tree node;
3157   unsigned i, j;
3158   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3159     {
3160       stmt_vec_info vinfo;
3161       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3162       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3163         continue;
3164       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3165       unsigned int size = DR_GROUP_SIZE (vinfo);
3166       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3167       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3168          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3169          && ! vect_grouped_store_supported (vectype, size))
3170         return opt_result::failure_at (vinfo->stmt,
3171                                        "unsupported grouped store\n");
3172       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3173         {
3174           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3175           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3176           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3177           size = DR_GROUP_SIZE (vinfo);
3178           vectype = STMT_VINFO_VECTYPE (vinfo);
3179           if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3180               && ! vect_grouped_load_supported (vectype, single_element_p,
3181                                                 size))
3182             return opt_result::failure_at (vinfo->stmt,
3183                                            "unsupported grouped load\n");
3184         }
3185     }
3186
3187   if (dump_enabled_p ())
3188     dump_printf_loc (MSG_NOTE, vect_location,
3189                      "re-trying with SLP disabled\n");
3190
3191   /* Roll back state appropriately.  No SLP this time.  */
3192   slp = false;
3193   /* Restore vectorization factor as it were without SLP.  */
3194   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3195   /* Free the SLP instances.  */
3196   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3197     vect_free_slp_instance (instance);
3198   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3199   /* Reset SLP type to loop_vect on all stmts.  */
3200   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3201     {
3202       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3203       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3204            !gsi_end_p (si); gsi_next (&si))
3205         {
3206           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3207           STMT_SLP_TYPE (stmt_info) = loop_vect;
3208           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3209               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3210             {
3211               /* vectorizable_reduction adjusts reduction stmt def-types,
3212                  restore them to that of the PHI.  */
3213               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3214                 = STMT_VINFO_DEF_TYPE (stmt_info);
3215               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3216                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3217                 = STMT_VINFO_DEF_TYPE (stmt_info);
3218             }
3219         }
3220       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3221            !gsi_end_p (si); gsi_next (&si))
3222         {
3223           if (is_gimple_debug (gsi_stmt (si)))
3224             continue;
3225           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3226           STMT_SLP_TYPE (stmt_info) = loop_vect;
3227           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3228             {
3229               stmt_vec_info pattern_stmt_info
3230                 = STMT_VINFO_RELATED_STMT (stmt_info);
3231               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3232                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3233
3234               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3235               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3236               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3237                    !gsi_end_p (pi); gsi_next (&pi))
3238                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3239                   = loop_vect;
3240             }
3241         }
3242     }
3243   /* Free optimized alias test DDRS.  */
3244   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3245   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3246   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3247   /* Reset target cost data.  */
3248   delete loop_vinfo->vector_costs;
3249   loop_vinfo->vector_costs = nullptr;
3250   /* Reset accumulated rgroup information.  */
3251   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3252   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3253   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3254   /* Reset assorted flags.  */
3255   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3256   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3257   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3258   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3259   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3260     = saved_can_use_partial_vectors_p;
3261
3262   goto start_over;
3263 }
3264
3265 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3266    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
3267    OLD_LOOP_VINFO is better unless something specifically indicates
3268    otherwise.
3269
3270    Note that this deliberately isn't a partial order.  */
3271
3272 static bool
3273 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3274                           loop_vec_info old_loop_vinfo)
3275 {
3276   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3277   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3278
3279   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3280   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3281
3282   /* Always prefer a VF of loop->simdlen over any other VF.  */
3283   if (loop->simdlen)
3284     {
3285       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3286       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3287       if (new_simdlen_p != old_simdlen_p)
3288         return new_simdlen_p;
3289     }
3290
3291   const auto *old_costs = old_loop_vinfo->vector_costs;
3292   const auto *new_costs = new_loop_vinfo->vector_costs;
3293   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3294     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3295
3296   return new_costs->better_main_loop_than_p (old_costs);
3297 }
3298
3299 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
3300    true if we should.  */
3301
3302 static bool
3303 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3304                         loop_vec_info old_loop_vinfo)
3305 {
3306   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3307     return false;
3308
3309   if (dump_enabled_p ())
3310     dump_printf_loc (MSG_NOTE, vect_location,
3311                      "***** Preferring vector mode %s to vector mode %s\n",
3312                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3313                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3314   return true;
3315 }
3316
3317 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3318    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3319    MODE_I to the next mode useful to analyze.
3320    Return the loop_vinfo on success and wrapped null on failure.  */
3321
3322 static opt_loop_vec_info
3323 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3324                      const vect_loop_form_info *loop_form_info,
3325                      loop_vec_info main_loop_vinfo,
3326                      const vector_modes &vector_modes, unsigned &mode_i,
3327                      machine_mode &autodetected_vector_mode,
3328                      bool &fatal)
3329 {
3330   loop_vec_info loop_vinfo
3331     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3332
3333   machine_mode vector_mode = vector_modes[mode_i];
3334   loop_vinfo->vector_mode = vector_mode;
3335   unsigned int suggested_unroll_factor = 1;
3336   bool slp_done_for_suggested_uf = false;
3337
3338   /* Run the main analysis.  */
3339   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3340                                         &suggested_unroll_factor,
3341                                         slp_done_for_suggested_uf);
3342   if (dump_enabled_p ())
3343     dump_printf_loc (MSG_NOTE, vect_location,
3344                      "***** Analysis %s with vector mode %s\n",
3345                      res ? "succeeded" : " failed",
3346                      GET_MODE_NAME (loop_vinfo->vector_mode));
3347
3348   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3349     {
3350       if (dump_enabled_p ())
3351         dump_printf_loc (MSG_NOTE, vect_location,
3352                          "***** Re-trying analysis for unrolling"
3353                          " with unroll factor %d and slp %s.\n",
3354                          suggested_unroll_factor,
3355                          slp_done_for_suggested_uf ? "on" : "off");
3356       loop_vec_info unroll_vinfo
3357         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3358       unroll_vinfo->vector_mode = vector_mode;
3359       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3360       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3361                                                 slp_done_for_suggested_uf);
3362       if (new_res)
3363         {
3364           delete loop_vinfo;
3365           loop_vinfo = unroll_vinfo;
3366         }
3367       else
3368         delete unroll_vinfo;
3369     }
3370
3371   /* Remember the autodetected vector mode.  */
3372   if (vector_mode == VOIDmode)
3373     autodetected_vector_mode = loop_vinfo->vector_mode;
3374
3375   /* Advance mode_i, first skipping modes that would result in the
3376      same analysis result.  */
3377   while (mode_i + 1 < vector_modes.length ()
3378          && vect_chooses_same_modes_p (loop_vinfo,
3379                                        vector_modes[mode_i + 1]))
3380     {
3381       if (dump_enabled_p ())
3382         dump_printf_loc (MSG_NOTE, vect_location,
3383                          "***** The result for vector mode %s would"
3384                          " be the same\n",
3385                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3386       mode_i += 1;
3387     }
3388   if (mode_i + 1 < vector_modes.length ()
3389       && VECTOR_MODE_P (autodetected_vector_mode)
3390       && (related_vector_mode (vector_modes[mode_i + 1],
3391                                GET_MODE_INNER (autodetected_vector_mode))
3392           == autodetected_vector_mode)
3393       && (related_vector_mode (autodetected_vector_mode,
3394                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3395           == vector_modes[mode_i + 1]))
3396     {
3397       if (dump_enabled_p ())
3398         dump_printf_loc (MSG_NOTE, vect_location,
3399                          "***** Skipping vector mode %s, which would"
3400                          " repeat the analysis for %s\n",
3401                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3402                          GET_MODE_NAME (autodetected_vector_mode));
3403       mode_i += 1;
3404     }
3405   mode_i++;
3406
3407   if (!res)
3408     {
3409       delete loop_vinfo;
3410       if (fatal)
3411         gcc_checking_assert (main_loop_vinfo == NULL);
3412       return opt_loop_vec_info::propagate_failure (res);
3413     }
3414
3415   return opt_loop_vec_info::success (loop_vinfo);
3416 }
3417
3418 /* Function vect_analyze_loop.
3419
3420    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3421    for it.  The different analyses will record information in the
3422    loop_vec_info struct.  */
3423 opt_loop_vec_info
3424 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3425 {
3426   DUMP_VECT_SCOPE ("analyze_loop_nest");
3427
3428   if (loop_outer (loop)
3429       && loop_vec_info_for_loop (loop_outer (loop))
3430       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3431     return opt_loop_vec_info::failure_at (vect_location,
3432                                           "outer-loop already vectorized.\n");
3433
3434   if (!find_loop_nest (loop, &shared->loop_nest))
3435     return opt_loop_vec_info::failure_at
3436       (vect_location,
3437        "not vectorized: loop nest containing two or more consecutive inner"
3438        " loops cannot be vectorized\n");
3439
3440   /* Analyze the loop form.  */
3441   vect_loop_form_info loop_form_info;
3442   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3443   if (!res)
3444     {
3445       if (dump_enabled_p ())
3446         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3447                          "bad loop form.\n");
3448       return opt_loop_vec_info::propagate_failure (res);
3449     }
3450   if (!integer_onep (loop_form_info.assumptions))
3451     {
3452       /* We consider to vectorize this loop by versioning it under
3453          some assumptions.  In order to do this, we need to clear
3454          existing information computed by scev and niter analyzer.  */
3455       scev_reset_htab ();
3456       free_numbers_of_iterations_estimates (loop);
3457       /* Also set flag for this loop so that following scev and niter
3458          analysis are done under the assumptions.  */
3459       loop_constraint_set (loop, LOOP_C_FINITE);
3460     }
3461
3462   auto_vector_modes vector_modes;
3463   /* Autodetect first vector size we try.  */
3464   vector_modes.safe_push (VOIDmode);
3465   unsigned int autovec_flags
3466     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3467                                                     loop->simdlen != 0);
3468   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3469                              && !unlimited_cost_model (loop));
3470   machine_mode autodetected_vector_mode = VOIDmode;
3471   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3472   unsigned int mode_i = 0;
3473   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3474
3475   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3476      a mode has not been analyzed.  */
3477   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3478   for (unsigned i = 0; i < vector_modes.length (); ++i)
3479     cached_vf_per_mode.safe_push (0);
3480
3481   /* First determine the main loop vectorization mode, either the first
3482      one that works, starting with auto-detecting the vector mode and then
3483      following the targets order of preference, or the one with the
3484      lowest cost if pick_lowest_cost_p.  */
3485   while (1)
3486     {
3487       bool fatal;
3488       unsigned int last_mode_i = mode_i;
3489       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3490          failed.  */
3491       cached_vf_per_mode[last_mode_i] = -1;
3492       opt_loop_vec_info loop_vinfo
3493         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3494                                NULL, vector_modes, mode_i,
3495                                autodetected_vector_mode, fatal);
3496       if (fatal)
3497         break;
3498
3499       if (loop_vinfo)
3500         {
3501           /*  Analyzis has been successful so update the VF value.  The
3502               VF should always be a multiple of unroll_factor and we want to
3503               capture the original VF here.  */
3504           cached_vf_per_mode[last_mode_i]
3505             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3506                          loop_vinfo->suggested_unroll_factor);
3507           /* Once we hit the desired simdlen for the first time,
3508              discard any previous attempts.  */
3509           if (simdlen
3510               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3511             {
3512               delete first_loop_vinfo;
3513               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3514               simdlen = 0;
3515             }
3516           else if (pick_lowest_cost_p
3517                    && first_loop_vinfo
3518                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3519             {
3520               /* Pick loop_vinfo over first_loop_vinfo.  */
3521               delete first_loop_vinfo;
3522               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3523             }
3524           if (first_loop_vinfo == NULL)
3525             first_loop_vinfo = loop_vinfo;
3526           else
3527             {
3528               delete loop_vinfo;
3529               loop_vinfo = opt_loop_vec_info::success (NULL);
3530             }
3531
3532           /* Commit to first_loop_vinfo if we have no reason to try
3533              alternatives.  */
3534           if (!simdlen && !pick_lowest_cost_p)
3535             break;
3536         }
3537       if (mode_i == vector_modes.length ()
3538           || autodetected_vector_mode == VOIDmode)
3539         break;
3540
3541       /* Try the next biggest vector size.  */
3542       if (dump_enabled_p ())
3543         dump_printf_loc (MSG_NOTE, vect_location,
3544                          "***** Re-trying analysis with vector mode %s\n",
3545                          GET_MODE_NAME (vector_modes[mode_i]));
3546     }
3547   if (!first_loop_vinfo)
3548     return opt_loop_vec_info::propagate_failure (res);
3549
3550   if (dump_enabled_p ())
3551     dump_printf_loc (MSG_NOTE, vect_location,
3552                      "***** Choosing vector mode %s\n",
3553                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3554
3555   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3556      enabled, SIMDUID is not set, it is the innermost loop and we have
3557      either already found the loop's SIMDLEN or there was no SIMDLEN to
3558      begin with.
3559      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3560   bool vect_epilogues = (!simdlen
3561                          && loop->inner == NULL
3562                          && param_vect_epilogues_nomask
3563                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3564                          && !loop->simduid);
3565   if (!vect_epilogues)
3566     return first_loop_vinfo;
3567
3568   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3569   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3570
3571   /* For epilogues start the analysis from the first mode.  The motivation
3572      behind starting from the beginning comes from cases where the VECTOR_MODES
3573      array may contain length-agnostic and length-specific modes.  Their
3574      ordering is not guaranteed, so we could end up picking a mode for the main
3575      loop that is after the epilogue's optimal mode.  */
3576   vector_modes[0] = autodetected_vector_mode;
3577   mode_i = 0;
3578
3579   bool supports_partial_vectors =
3580     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3581   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3582
3583   while (1)
3584     {
3585       /* If the target does not support partial vectors we can shorten the
3586          number of modes to analyze for the epilogue as we know we can't pick a
3587          mode that would lead to a VF at least as big as the
3588          FIRST_VINFO_VF.  */
3589       if (!supports_partial_vectors
3590           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3591         {
3592           mode_i++;
3593           if (mode_i == vector_modes.length ())
3594             break;
3595           continue;
3596         }
3597
3598       if (dump_enabled_p ())
3599         dump_printf_loc (MSG_NOTE, vect_location,
3600                          "***** Re-trying epilogue analysis with vector "
3601                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3602
3603       bool fatal;
3604       opt_loop_vec_info loop_vinfo
3605         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3606                                first_loop_vinfo,
3607                                vector_modes, mode_i,
3608                                autodetected_vector_mode, fatal);
3609       if (fatal)
3610         break;
3611
3612       if (loop_vinfo)
3613         {
3614           if (pick_lowest_cost_p)
3615             {
3616               /* Keep trying to roll back vectorization attempts while the
3617                  loop_vec_infos they produced were worse than this one.  */
3618               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3619               while (!vinfos.is_empty ()
3620                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3621                 {
3622                   gcc_assert (vect_epilogues);
3623                   delete vinfos.pop ();
3624                 }
3625             }
3626           /* For now only allow one epilogue loop.  */
3627           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3628             {
3629               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3630               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3631               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3632                           || maybe_ne (lowest_th, 0U));
3633               /* Keep track of the known smallest versioning
3634                  threshold.  */
3635               if (ordered_p (lowest_th, th))
3636                 lowest_th = ordered_min (lowest_th, th);
3637             }
3638           else
3639             {
3640               delete loop_vinfo;
3641               loop_vinfo = opt_loop_vec_info::success (NULL);
3642             }
3643
3644           /* For now only allow one epilogue loop, but allow
3645              pick_lowest_cost_p to replace it, so commit to the
3646              first epilogue if we have no reason to try alternatives.  */
3647           if (!pick_lowest_cost_p)
3648             break;
3649         }
3650
3651       if (mode_i == vector_modes.length ())
3652         break;
3653
3654     }
3655
3656   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3657     {
3658       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3659       if (dump_enabled_p ())
3660         dump_printf_loc (MSG_NOTE, vect_location,
3661                          "***** Choosing epilogue vector mode %s\n",
3662                          GET_MODE_NAME
3663                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3664     }
3665
3666   return first_loop_vinfo;
3667 }
3668
3669 /* Return true if there is an in-order reduction function for CODE, storing
3670    it in *REDUC_FN if so.  */
3671
3672 static bool
3673 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3674 {
3675   if (code == PLUS_EXPR)
3676     {
3677       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3678       return true;
3679     }
3680   return false;
3681 }
3682
3683 /* Function reduction_fn_for_scalar_code
3684
3685    Input:
3686    CODE - tree_code of a reduction operations.
3687
3688    Output:
3689    REDUC_FN - the corresponding internal function to be used to reduce the
3690       vector of partial results into a single scalar result, or IFN_LAST
3691       if the operation is a supported reduction operation, but does not have
3692       such an internal function.
3693
3694    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3695
3696 bool
3697 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3698 {
3699   if (code.is_tree_code ())
3700     switch (tree_code (code))
3701       {
3702       case MAX_EXPR:
3703         *reduc_fn = IFN_REDUC_MAX;
3704         return true;
3705
3706       case MIN_EXPR:
3707         *reduc_fn = IFN_REDUC_MIN;
3708         return true;
3709
3710       case PLUS_EXPR:
3711         *reduc_fn = IFN_REDUC_PLUS;
3712         return true;
3713
3714       case BIT_AND_EXPR:
3715         *reduc_fn = IFN_REDUC_AND;
3716         return true;
3717
3718       case BIT_IOR_EXPR:
3719         *reduc_fn = IFN_REDUC_IOR;
3720         return true;
3721
3722       case BIT_XOR_EXPR:
3723         *reduc_fn = IFN_REDUC_XOR;
3724         return true;
3725
3726       case MULT_EXPR:
3727       case MINUS_EXPR:
3728         *reduc_fn = IFN_LAST;
3729         return true;
3730
3731       default:
3732         return false;
3733       }
3734   else
3735     switch (combined_fn (code))
3736       {
3737       CASE_CFN_FMAX:
3738         *reduc_fn = IFN_REDUC_FMAX;
3739         return true;
3740
3741       CASE_CFN_FMIN:
3742         *reduc_fn = IFN_REDUC_FMIN;
3743         return true;
3744
3745       default:
3746         return false;
3747       }
3748 }
3749
3750 /* If there is a neutral value X such that a reduction would not be affected
3751    by the introduction of additional X elements, return that X, otherwise
3752    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3753    of the scalar elements.  If the reduction has just a single initial value
3754    then INITIAL_VALUE is that value, otherwise it is null.  */
3755
3756 tree
3757 neutral_op_for_reduction (tree scalar_type, code_helper code,
3758                           tree initial_value)
3759 {
3760   if (code.is_tree_code ())
3761     switch (tree_code (code))
3762       {
3763       case WIDEN_SUM_EXPR:
3764       case DOT_PROD_EXPR:
3765       case SAD_EXPR:
3766       case PLUS_EXPR:
3767       case MINUS_EXPR:
3768       case BIT_IOR_EXPR:
3769       case BIT_XOR_EXPR:
3770         return build_zero_cst (scalar_type);
3771
3772       case MULT_EXPR:
3773         return build_one_cst (scalar_type);
3774
3775       case BIT_AND_EXPR:
3776         return build_all_ones_cst (scalar_type);
3777
3778       case MAX_EXPR:
3779       case MIN_EXPR:
3780         return initial_value;
3781
3782       default:
3783         return NULL_TREE;
3784       }
3785   else
3786     switch (combined_fn (code))
3787       {
3788       CASE_CFN_FMIN:
3789       CASE_CFN_FMAX:
3790         return initial_value;
3791
3792       default:
3793         return NULL_TREE;
3794       }
3795 }
3796
3797 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3798    STMT is printed with a message MSG. */
3799
3800 static void
3801 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3802 {
3803   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3804 }
3805
3806 /* Return true if we need an in-order reduction for operation CODE
3807    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3808    overflow must wrap.  */
3809
3810 bool
3811 needs_fold_left_reduction_p (tree type, code_helper code)
3812 {
3813   /* CHECKME: check for !flag_finite_math_only too?  */
3814   if (SCALAR_FLOAT_TYPE_P (type))
3815     {
3816       if (code.is_tree_code ())
3817         switch (tree_code (code))
3818           {
3819           case MIN_EXPR:
3820           case MAX_EXPR:
3821             return false;
3822
3823           default:
3824             return !flag_associative_math;
3825           }
3826       else
3827         switch (combined_fn (code))
3828           {
3829           CASE_CFN_FMIN:
3830           CASE_CFN_FMAX:
3831             return false;
3832
3833           default:
3834             return !flag_associative_math;
3835           }
3836     }
3837
3838   if (INTEGRAL_TYPE_P (type))
3839     return (!code.is_tree_code ()
3840             || !operation_no_trapping_overflow (type, tree_code (code)));
3841
3842   if (SAT_FIXED_POINT_TYPE_P (type))
3843     return true;
3844
3845   return false;
3846 }
3847
3848 /* Return true if the reduction PHI in LOOP, whose latch argument is
3849    LOOP_ARG, has a handled computation expression.  Store the main
3850    reduction operation in *CODE.

        The function first searches, depth first over SSA use operands, for
        a chain of uses leading from LOOP_ARG back to the result of PHI and
        records that chain in PATH (element 0 is the use of LOOP_ARG in PHI
        itself).  It then validates every statement on the chain.  LOC is
        the location used for the "reduction path" dump message.

        On a false return *CODE and PATH may still have been modified.  */
3851
3852 static bool
3853 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3854                       tree loop_arg, code_helper *code,
3855                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3856 {
       /* SSA names already explored by the search, keyed by SSA version.  */
3857   auto_bitmap visited;
3858   tree lookfor = PHI_RESULT (phi);
3859   ssa_op_iter curri;
       /* Position CURR on the PHI argument that is LOOP_ARG.  */
3860   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3861   while (USE_FROM_PTR (curr) != loop_arg)
3862     curr = op_iter_next_use (&curri);
       /* NOTE(review): presumably this marks the PHI operand iterator as
          exhausted so that backtracking to the first path element never
          tries the remaining PHI arguments - confirm against ssa_op_iter.  */
3863   curri.i = curri.numops;
3864   do
3865     {
3866       path.safe_push (std::make_pair (curri, curr));
3867       tree use = USE_FROM_PTR (curr);
           /* Reaching the PHI result closes the cycle; the search is done.  */
3868       if (use == lookfor)
3869         break;
3870       gimple *def = SSA_NAME_DEF_STMT (use);
           /* A definition without a statement or from outside LOOP is a dead
              end: backtrack.  */
3871       if (gimple_nop_p (def)
3872           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3873         {
3874 pop:
               /* Backtrack: drop path elements until one of them has another
                  not yet visited SSA operand to continue with, or the path
                  becomes empty.  */
3875           do
3876             {
3877               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3878               curri = x.first;
3879               curr = x.second;
3880               do
3881                 curr = op_iter_next_use (&curri);
3882               /* Skip already visited or non-SSA operands (from iterating
3883                  over PHI args).  */
3884               while (curr != NULL_USE_OPERAND_P
3885                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3886                          || ! bitmap_set_bit (visited,
3887                                               SSA_NAME_VERSION
3888                                                 (USE_FROM_PTR (curr)))));
3889             }
3890           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
               /* Nothing left to explore: the search failed and PATH is
                  empty.  */
3891           if (curr == NULL_USE_OPERAND_P)
3892             break;
3893         }
3894       else
3895         {
               /* Descend into the operands of the defining statement,
                  starting with its first SSA operand not visited so far.  */
3896           if (gimple_code (def) == GIMPLE_PHI)
3897             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3898           else
3899             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3900           while (curr != NULL_USE_OPERAND_P
3901                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3902                      || ! bitmap_set_bit (visited,
3903                                           SSA_NAME_VERSION
3904                                             (USE_FROM_PTR (curr)))))
3905             curr = op_iter_next_use (&curri);
3906           if (curr == NULL_USE_OPERAND_P)
3907             goto pop;
3908         }
3909     }
3910   while (1);
3911   if (dump_file && (dump_flags & TDF_DETAILS))
3912     {
3913       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3914       unsigned i;
3915       std::pair<ssa_op_iter, use_operand_p> *x;
3916       FOR_EACH_VEC_ELT (path, i, x)
3917         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3918       dump_printf (MSG_NOTE, "\n");
3919     }
3920
3921   /* Check whether the reduction path detected is valid.  */
3922   bool fail = path.length () == 0;
       /* Whether the reduction value ends up negated, see MINUS_EXPR below.  */
3923   bool neg = false;
       /* Signedness of the type of the first value-changing operation.  */
3924   int sign = -1;
3925   *code = ERROR_MARK;
       /* Element 0 is the use in PHI itself, so start with element 1.  */
3926   for (unsigned i = 1; i < path.length (); ++i)
3927     {
3928       gimple *use_stmt = USE_STMT (path[i].second);
3929       gimple_match_op op;
3930       if (!gimple_extract_op (use_stmt, &op))
3931         {
3932           fail = true;
3933           break;
3934         }
           /* Find the index OPI of the operand through which the reduction
              value enters USE_STMT; OP.NUM_OPS means "not found".  */
3935       unsigned int opi = op.num_ops;
3936       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3937         {
3938           /* The following make sure we can compute the operand index
3939              easily plus it mostly disallows chaining via COND_EXPR condition
3940              operands.  */
3941           for (opi = 0; opi < op.num_ops; ++opi)
3942             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3943               break;
3944         }
3945       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3946         {
3947           for (opi = 0; opi < op.num_ops; ++opi)
3948             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3949               break;
3950         }
3951       if (opi == op.num_ops)
3952         {
3953           fail = true;
3954           break;
3955         }
3956       op.code = canonicalize_code (op.code, op.type);
           /* Treat a subtraction like an addition for the purpose of the
              uniform-code check below.  */
3957       if (op.code == MINUS_EXPR)
3958         {
3959           op.code = PLUS_EXPR;
3960           /* Track whether we negate the reduction value each iteration.  */
3961           if (op.ops[1] == op.ops[opi])
3962             neg = ! neg;
3963         }
           /* No-op conversions are ignored.  The first other operation
              determines *CODE; every later one has to use the same code and,
              for MIN_EXPR/MAX_EXPR, a type of the same signedness.  */
3964       if (CONVERT_EXPR_CODE_P (op.code)
3965           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3966         ;
3967       else if (*code == ERROR_MARK)
3968         {
3969           *code = op.code;
3970           sign = TYPE_SIGN (op.type);
3971         }
3972       else if (op.code != *code)
3973         {
3974           fail = true;
3975           break;
3976         }
3977       else if ((op.code == MIN_EXPR
3978                 || op.code == MAX_EXPR)
3979                && sign != TYPE_SIGN (op.type))
3980         {
3981           fail = true;
3982           break;
3983         }
3984       /* Check there's only a single stmt the op is used on.  For the
3985          not value-changing tail and the last stmt allow out-of-loop uses.
3986          ???  We could relax this and handle arbitrary live stmts by
3987          forcing a scalar epilogue for example.  */
3988       imm_use_iterator imm_iter;
3989       use_operand_p use_p;
3990       gimple *op_use_stmt;
3991       unsigned cnt = 0;
           /* Count the non-debug uses of the reduction operand.  As long as
              *CODE has not been determined only uses inside LOOP count,
              afterwards every use does.  */
3992       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3993         if (!is_gimple_debug (op_use_stmt)
3994             && (*code != ERROR_MARK
3995                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3996           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3997             cnt++;
3998       if (cnt != 1)
3999         {
4000           fail = true;
4001           break;
4002         }
4003     }
       /* Succeed only for a valid path that contains a value-changing
          operation and does not negate the reduction value.  */
4004   return ! fail && ! neg && *code != ERROR_MARK;
4005 }
4006
4007 bool
4008 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4009                       tree loop_arg, enum tree_code code)
4010 {
4011   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4012   code_helper code_;
4013   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4014           && code_ == code);
4015 }
4016
4017
4018
4019 /* Function vect_is_simple_reduction
4020
4021    (1) Detect a cross-iteration def-use cycle that represents a simple
4022    reduction computation.  We look for the following pattern:
4023
4024    loop_header:
4025      a1 = phi < a0, a2 >
4026      a3 = ...
4027      a2 = operation (a3, a1)
4028
4029    or
4030
4031    a3 = ...
4032    loop_header:
4033      a1 = phi < a0, a2 >
4034      a2 = operation (a3, a1)
4035
4036    such that:
4037    1. operation is commutative and associative and it is safe to
4038       change the order of the computation
4039    2. no uses for a2 in the loop (a2 is used out of the loop)
4040    3. no uses of a1 in the loop besides the reduction operation
4041    4. no uses of a1 outside the loop.
4042
4043    Conditions 1,4 are tested here.
4044    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4045
4046    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4047    nested cycles.
4048
4049    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4050    reductions:
4051
4052      a1 = phi < a0, a2 >
4053      inner loop (def of a3)
4054      a2 = phi < a3 >
4055
4056    (4) Detect condition expressions, ie:
4057      for (int i = 0; i < N; i++)
4058        if (a[i] < val)
4059         ret_val = a[i];
4060

        PHI_INFO describes the loop-header PHI to analyze and LOOP_INFO the
        loop being vectorized.  On success return the stmt_vec_info of the
        statement defining the latch argument of the PHI, otherwise return
        NULL.  *DOUBLE_REDUC is set to true if a double reduction was
        detected and *REDUC_CHAIN_P is set to true if an SLP reduction chain
        was recorded (which is only attempted when SLP is true); both are
        false otherwise.

        As side effects STMT_VINFO_REDUC_TYPE of PHI_INFO is set, and for a
        detected reduction path also STMT_VINFO_REDUC_CODE of PHI_INFO and
        STMT_VINFO_REDUC_IDX of the statements on the path.

4061 */
4062
4063 static stmt_vec_info
4064 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4065                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4066 {
4067   gphi *phi = as_a <gphi *> (phi_info->stmt);
4068   gimple *phi_use_stmt = NULL;
4069   imm_use_iterator imm_iter;
4070   use_operand_p use_p;
4071
4072   *double_reduc = false;
4073   *reduc_chain_p = false;
4074   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4075
4076   tree phi_name = PHI_RESULT (phi);
4077   /* ???  If there are no uses of the PHI result the inner loop reduction
4078      won't be detected as possibly double-reduction by vectorizable_reduction
4079      because that tries to walk the PHI arg from the preheader edge which
4080      can be constant.  See PR60382.  */
4081   if (has_zero_uses (phi_name))
4082     return NULL;
4083   class loop *loop = (gimple_bb (phi))->loop_father;
       /* Count the non-debug uses of the PHI result, all of which have to be
          inside LOOP, and remember the last one seen.  */
4084   unsigned nphi_def_loop_uses = 0;
4085   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4086     {
4087       gimple *use_stmt = USE_STMT (use_p);
4088       if (is_gimple_debug (use_stmt))
4089         continue;
4090
4091       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4092         {
4093           if (dump_enabled_p ())
4094             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4095                              "intermediate value used outside loop.\n");
4096
4097           return NULL;
4098         }
4099
4100       nphi_def_loop_uses++;
4101       phi_use_stmt = use_stmt;
4102     }
4103
       /* The value flowing around the back edge has to be an SSA name that
          is defined by a statement inside LOOP.  */
4104   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4105   if (TREE_CODE (latch_def) != SSA_NAME)
4106     {
4107       if (dump_enabled_p ())
4108         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4109                          "reduction: not ssa_name: %T\n", latch_def);
4110       return NULL;
4111     }
4112
4113   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4114   if (!def_stmt_info
4115       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4116     return NULL;
4117
       /* True if LOOP is nested inside the loop being vectorized.  */
4118   bool nested_in_vect_loop
4119     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
       /* Classify the non-debug uses of the latch definition: count those
          inside LOOP and collect the PHIs using it outside of LOOP.  */
4120   unsigned nlatch_def_loop_uses = 0;
4121   auto_vec<gphi *, 3> lcphis;
4122   bool inner_loop_of_double_reduc = false;
4123   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4124     {
4125       gimple *use_stmt = USE_STMT (use_p);
4126       if (is_gimple_debug (use_stmt))
4127         continue;
4128       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4129         nlatch_def_loop_uses++;
4130       else
4131         {
4132           /* We can have more than one loop-closed PHI.  */
4133           lcphis.safe_push (as_a <gphi *> (use_stmt));
4134           if (nested_in_vect_loop
4135               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4136                   == vect_double_reduction_def))
4137             inner_loop_of_double_reduc = true;
4138         }
4139     }
4140
4141   /* If we are vectorizing an inner reduction we are executing that
4142      in the original order only in case we are not dealing with a
4143      double reduction.  */
4144   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4145     {
4146       if (dump_enabled_p ())
4147         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4148                         "detected nested cycle: ");
4149       return def_stmt_info;
4150     }
4151
4152   /* When the inner loop of a double reduction ends up with more than
4153      one loop-closed PHI we have failed to classify alternate such
4154      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4155   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4156     {
4157       if (dump_enabled_p ())
4158         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4159                          "unhandle double reduction\n");
4160       return NULL;
4161     }
4162
4163   /* If this isn't a nested cycle or if the nested cycle reduction value
4164      is used outside of the inner loop we cannot handle uses of the reduction
4165      value.  */
4166   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4167     {
4168       if (dump_enabled_p ())
4169         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4170                          "reduction used in loop.\n");
4171       return NULL;
4172     }
4173
4174   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4175      defined in the inner loop.  */
4176   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4177     {
4178       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4179       if (gimple_phi_num_args (def_stmt) != 1
4180           || TREE_CODE (op1) != SSA_NAME)
4181         {
4182           if (dump_enabled_p ())
4183             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4184                              "unsupported phi node definition.\n");
4185
4186           return NULL;
4187         }
4188
4189       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4190          and the latch definition op1.  */
4191       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4192       if (gimple_bb (def1)
4193           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4194           && loop->inner
4195           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4196           && (is_gimple_assign (def1) || is_gimple_call (def1))
4197           && is_a <gphi *> (phi_use_stmt)
4198           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4199           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4200                                             loop_latch_edge (loop->inner))))
4201         {
4202           if (dump_enabled_p ())
4203             report_vect_op (MSG_NOTE, def_stmt,
4204                             "detected double reduction: ");
4205
4206           *double_reduc = true;
4207           return def_stmt_info;
4208         }
4209
4210       return NULL;
4211     }
4212
4213   /* Look for the expression computing latch_def from the loop PHI result.  */
4214   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4215   code_helper code;
4216   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4217                             path))
4218     {
4219       STMT_VINFO_REDUC_CODE (phi_info) = code;
4220       if (code == COND_EXPR && !nested_in_vect_loop)
4221         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4222
4223       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4224          reduction chain for which the additional restriction is that
4225          all operations in the chain are the same.  */
4226       auto_vec<stmt_vec_info, 8> reduc_chain;
4227       unsigned i;
4228       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
           /* Walk the path from its last element down to element 1; element 0
              is the use in the PHI itself and is not a chain statement.  */
4229       for (i = path.length () - 1; i >= 1; --i)
4230         {
4231           gimple *stmt = USE_STMT (path[i].second);
4232           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4233           gimple_match_op op;
4234           if (!gimple_extract_op (stmt, &op))
4235             gcc_unreachable ();
               /* Record the index of the operand carrying the reduction value,
                  relative to the first rhs operand resp. first call
                  argument.  */
4236           if (gassign *assign = dyn_cast<gassign *> (stmt))
4237             STMT_VINFO_REDUC_IDX (stmt_info)
4238               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4239           else
4240             {
4241               gcall *call = as_a<gcall *> (stmt);
4242               STMT_VINFO_REDUC_IDX (stmt_info)
4243                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4244             }
               /* Despite its name this is true for a conversion at either end
                  of the path.  */
4245           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4246                                      && (i == 1 || i == path.length () - 1));
4247           if ((op.code != code && !leading_conversion)
4248               /* We can only handle the final value in epilogue
4249                  generation for reduction chains.  */
4250               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4251             is_slp_reduc = false;
4252           /* For reduction chains we support a trailing/leading
4253              conversions.  We do not store those in the actual chain.  */
4254           if (leading_conversion)
4255             continue;
4256           reduc_chain.safe_push (stmt_info);
4257         }
4258       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4259         {
               /* Link the chain elements: each one points to the first element
                  and to its successor, the last one has no successor.  */
4260           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4261             {
4262               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4263               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4264             }
4265           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4266           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4267
4268           /* Save the chain for further analysis in SLP detection.  */
4269           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4270           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4271
4272           *reduc_chain_p = true;
4273           if (dump_enabled_p ())
4274             dump_printf_loc (MSG_NOTE, vect_location,
4275                             "reduction: detected reduction chain\n");
4276         }
4277       else if (dump_enabled_p ())
4278         dump_printf_loc (MSG_NOTE, vect_location,
4279                          "reduction: detected reduction\n");
4280
4281       return def_stmt_info;
4282     }
4283
4284   if (dump_enabled_p ())
4285     dump_printf_loc (MSG_NOTE, vect_location,
4286                      "reduction: unknown pattern\n");
4287
4288   return NULL;
4289 }
4290
4291 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4292    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4293    or -1 if not known.  */
4294
4295 static int
4296 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4297 {
4298   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4299   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4300     {
4301       if (dump_enabled_p ())
4302         dump_printf_loc (MSG_NOTE, vect_location,
4303                          "cost model: epilogue peel iters set to vf/2 "
4304                          "because loop iterations are unknown .\n");
4305       return assumed_vf / 2;
4306     }
4307   else
4308     {
4309       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4310       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4311       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4312       /* If we need to peel for gaps, but no peeling is required, we have to
4313          peel VF iterations.  */
4314       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4315         peel_iters_epilogue = assumed_vf;
4316       return peel_iters_epilogue;
4317     }
4318 }
4319
4320 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
4321 int
4322 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4323                              int *peel_iters_epilogue,
4324                              stmt_vector_for_cost *scalar_cost_vec,
4325                              stmt_vector_for_cost *prologue_cost_vec,
4326                              stmt_vector_for_cost *epilogue_cost_vec)
4327 {
4328   int retval = 0;
4329
4330   *peel_iters_epilogue
4331     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4332
4333   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4334     {
4335       /* If peeled iterations are known but number of scalar loop
4336          iterations are unknown, count a taken branch per peeled loop.  */
4337       if (peel_iters_prologue > 0)
4338         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4339                                    vect_prologue);
4340       if (*peel_iters_epilogue > 0)
4341         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4342                                     vect_epilogue);
4343     }
4344
4345   stmt_info_for_cost *si;
4346   int j;
4347   if (peel_iters_prologue)
4348     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4349       retval += record_stmt_cost (prologue_cost_vec,
4350                                   si->count * peel_iters_prologue,
4351                                   si->kind, si->stmt_info, si->misalign,
4352                                   vect_prologue);
4353   if (*peel_iters_epilogue)
4354     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4355       retval += record_stmt_cost (epilogue_cost_vec,
4356                                   si->count * *peel_iters_epilogue,
4357                                   si->kind, si->stmt_info, si->misalign,
4358                                   vect_epilogue);
4359
4360   return retval;
4361 }
4362
4363 /* Function vect_estimate_min_profitable_iters
4364
4365    Return the number of iterations required for the vector version of the
4366    loop to be profitable relative to the cost of the scalar version of the
4367    loop.
4368
4369    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4370    of iterations for vectorization.  -1 value means loop vectorization
4371    is not profitable.  This returned value may be used for dynamic
4372    profitability check.
4373
4374    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4375    for static check against estimated number of iterations.  */
4376
4377 static void
4378 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4379                                     int *ret_min_profitable_niters,
4380                                     int *ret_min_profitable_estimate,
4381                                     unsigned *suggested_unroll_factor)
4382 {
4383   int min_profitable_iters;
4384   int min_profitable_estimate;
4385   int peel_iters_prologue;
4386   int peel_iters_epilogue;
4387   unsigned vec_inside_cost = 0;
4388   int vec_outside_cost = 0;
4389   unsigned vec_prologue_cost = 0;
4390   unsigned vec_epilogue_cost = 0;
4391   int scalar_single_iter_cost = 0;
4392   int scalar_outside_cost = 0;
4393   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4394   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4395   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4396
4397   /* Cost model disabled.  */
4398   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4399     {
4400       if (dump_enabled_p ())
4401         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4402       *ret_min_profitable_niters = 0;
4403       *ret_min_profitable_estimate = 0;
4404       return;
4405     }
4406
4407   /* Requires loop versioning tests to handle misalignment.  */
4408   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4409     {
4410       /*  FIXME: Make cost depend on complexity of individual check.  */
4411       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4412       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4413       if (dump_enabled_p ())
4414         dump_printf (MSG_NOTE,
4415                      "cost model: Adding cost of checks for loop "
4416                      "versioning to treat misalignment.\n");
4417     }
4418
4419   /* Requires loop versioning with alias checks.  */
4420   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4421     {
4422       /*  FIXME: Make cost depend on complexity of individual check.  */
4423       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4424       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4425       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4426       if (len)
4427         /* Count LEN - 1 ANDs and LEN comparisons.  */
4428         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4429                               scalar_stmt, vect_prologue);
4430       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4431       if (len)
4432         {
4433           /* Count LEN - 1 ANDs and LEN comparisons.  */
4434           unsigned int nstmts = len * 2 - 1;
4435           /* +1 for each bias that needs adding.  */
4436           for (unsigned int i = 0; i < len; ++i)
4437             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4438               nstmts += 1;
4439           (void) add_stmt_cost (target_cost_data, nstmts,
4440                                 scalar_stmt, vect_prologue);
4441         }
4442       if (dump_enabled_p ())
4443         dump_printf (MSG_NOTE,
4444                      "cost model: Adding cost of checks for loop "
4445                      "versioning aliasing.\n");
4446     }
4447
4448   /* Requires loop versioning with niter checks.  */
4449   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4450     {
4451       /*  FIXME: Make cost depend on complexity of individual check.  */
4452       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4453                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4454       if (dump_enabled_p ())
4455         dump_printf (MSG_NOTE,
4456                      "cost model: Adding cost of checks for loop "
4457                      "versioning niters.\n");
4458     }
4459
4460   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4461     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4462                           vect_prologue);
4463
4464   /* Count statements in scalar loop.  Using this as scalar cost for a single
4465      iteration for now.
4466
4467      TODO: Add outer loop support.
4468
4469      TODO: Consider assigning different costs to different scalar
4470      statements.  */
4471
4472   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4473
4474   /* Add additional cost for the peeled instructions in prologue and epilogue
4475      loop.  (For fully-masked loops there will be no peeling.)
4476
4477      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4478      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4479
4480      TODO: Build an expression that represents peel_iters for prologue and
4481      epilogue to be used in a run-time test.  */
4482
4483   bool prologue_need_br_taken_cost = false;
4484   bool prologue_need_br_not_taken_cost = false;
4485
4486   /* Calculate peel_iters_prologue.  */
4487   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4488     peel_iters_prologue = 0;
4489   else if (npeel < 0)
4490     {
4491       peel_iters_prologue = assumed_vf / 2;
4492       if (dump_enabled_p ())
4493         dump_printf (MSG_NOTE, "cost model: "
4494                      "prologue peel iters set to vf/2.\n");
4495
4496       /* If peeled iterations are unknown, count a taken branch and a not taken
4497          branch per peeled loop.  Even if scalar loop iterations are known,
4498          vector iterations are not known since peeled prologue iterations are
4499          not known.  Hence guards remain the same.  */
4500       prologue_need_br_taken_cost = true;
4501       prologue_need_br_not_taken_cost = true;
4502     }
4503   else
4504     {
4505       peel_iters_prologue = npeel;
4506       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4507         /* If peeled iterations are known but number of scalar loop
4508            iterations are unknown, count a taken branch per peeled loop.  */
4509         prologue_need_br_taken_cost = true;
4510     }
4511
4512   bool epilogue_need_br_taken_cost = false;
4513   bool epilogue_need_br_not_taken_cost = false;
4514
4515   /* Calculate peel_iters_epilogue.  */
4516   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4517     /* We need to peel exactly one iteration for gaps.  */
4518     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4519   else if (npeel < 0)
4520     {
4521       /* If peeling for alignment is unknown, loop bound of main loop
4522          becomes unknown.  */
4523       peel_iters_epilogue = assumed_vf / 2;
4524       if (dump_enabled_p ())
4525         dump_printf (MSG_NOTE, "cost model: "
4526                      "epilogue peel iters set to vf/2 because "
4527                      "peeling for alignment is unknown.\n");
4528
4529       /* See the same reason above in peel_iters_prologue calculation.  */
4530       epilogue_need_br_taken_cost = true;
4531       epilogue_need_br_not_taken_cost = true;
4532     }
4533   else
4534     {
4535       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4536       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4537         /* If peeled iterations are known but number of scalar loop
4538            iterations are unknown, count a taken branch per peeled loop.  */
4539         epilogue_need_br_taken_cost = true;
4540     }
4541
4542   stmt_info_for_cost *si;
4543   int j;
4544   /* Add costs associated with peel_iters_prologue.  */
4545   if (peel_iters_prologue)
4546     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4547       {
4548         (void) add_stmt_cost (target_cost_data,
4549                               si->count * peel_iters_prologue, si->kind,
4550                               si->stmt_info, si->node, si->vectype,
4551                               si->misalign, vect_prologue);
4552       }
4553
4554   /* Add costs associated with peel_iters_epilogue.  */
4555   if (peel_iters_epilogue)
4556     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4557       {
4558         (void) add_stmt_cost (target_cost_data,
4559                               si->count * peel_iters_epilogue, si->kind,
4560                               si->stmt_info, si->node, si->vectype,
4561                               si->misalign, vect_epilogue);
4562       }
4563
4564   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4565
4566   if (prologue_need_br_taken_cost)
4567     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4568                           vect_prologue);
4569
4570   if (prologue_need_br_not_taken_cost)
4571     (void) add_stmt_cost (target_cost_data, 1,
4572                           cond_branch_not_taken, vect_prologue);
4573
4574   if (epilogue_need_br_taken_cost)
4575     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4576                           vect_epilogue);
4577
4578   if (epilogue_need_br_not_taken_cost)
4579     (void) add_stmt_cost (target_cost_data, 1,
4580                           cond_branch_not_taken, vect_epilogue);
4581
4582   /* Take care of special costs for rgroup controls of partial vectors.  */
4583   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4584       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4585           == vect_partial_vectors_avx512))
4586     {
4587       /* Calculate how many masks we need to generate.  */
4588       unsigned int num_masks = 0;
4589       bool need_saturation = false;
4590       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4591         if (rgm.type)
4592           {
4593             unsigned nvectors = rgm.factor;
4594             num_masks += nvectors;
4595             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4596                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4597               need_saturation = true;
4598           }
4599
4600       /* ???  The target isn't able to identify the costs below as
4601          producing masks so it cannot penalize cases where we'd run
4602          out of mask registers for example.  */
4603
4604       /* ???  We are also failing to account for smaller vector masks
4605          we generate by splitting larger masks in vect_get_loop_mask.  */
4606
4607       /* In the worst case, we need to generate each mask in the prologue
4608          and in the loop body.  We need one splat per group and one
4609          compare per mask.
4610
4611          Sometimes the prologue mask will fold to a constant,
4612          so the actual prologue cost might be smaller.  However, it's
4613          simpler and safer to use the worst-case cost; if this ends up
4614          being the tie-breaker between vectorizing or not, then it's
4615          probably better not to vectorize.  */
4616       (void) add_stmt_cost (target_cost_data,
4617                             num_masks
4618                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4619                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4620                             vect_prologue);
4621       (void) add_stmt_cost (target_cost_data,
4622                             num_masks
4623                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4624                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4625
4626       /* When we need saturation we need it both in the prologue and
4627          the epilogue.  */
4628       if (need_saturation)
4629         {
4630           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4631                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4632           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4633                                 NULL, NULL, NULL_TREE, 0, vect_body);
4634         }
4635     }
4636   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4637            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4638                == vect_partial_vectors_while_ult))
4639     {
4640       /* Calculate how many masks we need to generate.  */
4641       unsigned int num_masks = 0;
4642       rgroup_controls *rgm;
4643       unsigned int num_vectors_m1;
4644       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4645                         num_vectors_m1, rgm)
4646         if (rgm->type)
4647           num_masks += num_vectors_m1 + 1;
4648       gcc_assert (num_masks > 0);
4649
4650       /* In the worst case, we need to generate each mask in the prologue
4651          and in the loop body.  One of the loop body mask instructions
4652          replaces the comparison in the scalar loop, and since we don't
4653          count the scalar comparison against the scalar body, we shouldn't
4654          count that vector instruction against the vector body either.
4655
4656          Sometimes we can use unpacks instead of generating prologue
4657          masks and sometimes the prologue mask will fold to a constant,
4658          so the actual prologue cost might be smaller.  However, it's
4659          simpler and safer to use the worst-case cost; if this ends up
4660          being the tie-breaker between vectorizing or not, then it's
4661          probably better not to vectorize.  */
4662       (void) add_stmt_cost (target_cost_data, num_masks,
4663                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4664                             vect_prologue);
4665       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4666                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4667                             vect_body);
4668     }
4669   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4670     {
4671       /* Referring to the functions vect_set_loop_condition_partial_vectors
4672          and vect_set_loop_controls_directly, we need to generate each
4673          length in the prologue and in the loop body if required. Although
4674          there are some possible optimizations, we consider the worst case
4675          here.  */
4676
4677       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4678       signed char partial_load_store_bias
4679         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4680       bool need_iterate_p
4681         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4682            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4683
4684       /* Calculate how many statements to be added.  */
4685       unsigned int prologue_stmts = 0;
4686       unsigned int body_stmts = 0;
4687
4688       rgroup_controls *rgc;
4689       unsigned int num_vectors_m1;
4690       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4691         if (rgc->type)
4692           {
4693             /* May need one SHIFT for nitems_total computation.  */
4694             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4695             if (nitems != 1 && !niters_known_p)
4696               prologue_stmts += 1;
4697
4698             /* May need one MAX and one MINUS for wrap around.  */
4699             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4700               prologue_stmts += 2;
4701
4702             /* Need one MAX and one MINUS for each batch limit excepting for
4703                the 1st one.  */
4704             prologue_stmts += num_vectors_m1 * 2;
4705
4706             unsigned int num_vectors = num_vectors_m1 + 1;
4707
4708             /* Need to set up lengths in prologue, only one MIN required
4709                for each since start index is zero.  */
4710             prologue_stmts += num_vectors;
4711
4712             /* If we have a non-zero partial load bias, we need one PLUS
4713                to adjust the load length.  */
4714             if (partial_load_store_bias != 0)
4715               body_stmts += 1;
4716
4717             /* Each may need two MINs and one MINUS to update lengths in body
4718                for next iteration.  */
4719             if (need_iterate_p)
4720               body_stmts += 3 * num_vectors;
4721           }
4722
4723       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4724                             scalar_stmt, vect_prologue);
4725       (void) add_stmt_cost (target_cost_data, body_stmts,
4726                             scalar_stmt, vect_body);
4727     }
4728
4729   /* FORNOW: The scalar outside cost is incremented in one of the
4730      following ways:
4731
4732      1. The vectorizer checks for alignment and aliasing and generates
4733      a condition that allows dynamic vectorization.  A cost model
4734      check is ANDED with the versioning condition.  Hence scalar code
4735      path now has the added cost of the versioning check.
4736
4737        if (cost > th & versioning_check)
4738          jmp to vector code
4739
4740      Hence run-time scalar is incremented by not-taken branch cost.
4741
4742      2. The vectorizer then checks if a prologue is required.  If the
4743      cost model check was not done before during versioning, it has to
4744      be done before the prologue check.
4745
4746        if (cost <= th)
4747          prologue = scalar_iters
4748        if (prologue == 0)
4749          jmp to vector code
4750        else
4751          execute prologue
4752        if (prologue == num_iters)
4753          go to exit
4754
4755      Hence the run-time scalar cost is incremented by a taken branch,
4756      plus a not-taken branch, plus a taken branch cost.
4757
4758      3. The vectorizer then checks if an epilogue is required.  If the
4759      cost model check was not done before during prologue check, it
4760      has to be done with the epilogue check.
4761
4762        if (prologue == 0)
4763          jmp to vector code
4764        else
4765          execute prologue
4766        if (prologue == num_iters)
4767          go to exit
4768        vector code:
4769          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4770            jmp to epilogue
4771
4772      Hence the run-time scalar cost should be incremented by 2 taken
4773      branches.
4774
4775      TODO: The back end may reorder the BBS's differently and reverse
4776      conditions/branch directions.  Change the estimates below to
4777      something more reasonable.  */
4778
4779   /* If the number of iterations is known and we do not do versioning, we can
4780      decide whether to vectorize at compile time.  Hence the scalar version
4781      does not carry cost model guard costs.  */
4782   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4783       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4784     {
4785       /* Cost model check occurs at versioning.  */
4786       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4787         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4788       else
4789         {
4790           /* Cost model check occurs at prologue generation.  */
4791           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4792             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4793               + vect_get_stmt_cost (cond_branch_not_taken);
4794           /* Cost model check occurs at epilogue generation.  */
4795           else
4796             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4797         }
4798     }
4799
4800   /* Complete the target-specific cost calculations.  */
4801   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4802                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4803                suggested_unroll_factor);
4804
4805   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4806       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4807       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4808                     *suggested_unroll_factor,
4809                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4810     {
4811       if (dump_enabled_p ())
4812         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4813                          "can't unroll as unrolled vectorization factor larger"
4814                          " than maximum vectorization factor: "
4815                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4816                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4817       *suggested_unroll_factor = 1;
4818     }
4819
4820   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4821
4822   if (dump_enabled_p ())
4823     {
4824       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4825       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4826                    vec_inside_cost);
4827       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4828                    vec_prologue_cost);
4829       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4830                    vec_epilogue_cost);
4831       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4832                    scalar_single_iter_cost);
4833       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4834                    scalar_outside_cost);
4835       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4836                    vec_outside_cost);
4837       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4838                    peel_iters_prologue);
4839       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4840                    peel_iters_epilogue);
4841     }
4842
4843   /* Calculate number of iterations required to make the vector version
4844      profitable, relative to the loop bodies only.  The following condition
4845      must hold true:
4846      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4847      where
4848      SIC = scalar iteration cost, VIC = vector iteration cost,
4849      VOC = vector outside cost, VF = vectorization factor,
4850      NPEEL = prologue iterations + epilogue iterations,
4851      SOC = scalar outside cost for run time cost model check.  */
4852
4853   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4854                           - vec_inside_cost);
4855   if (saving_per_viter <= 0)
4856     {
4857       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4858         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4859                     "vectorization did not happen for a simd loop");
4860
4861       if (dump_enabled_p ())
4862         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4863                          "cost model: the vector iteration cost = %d "
4864                          "divided by the scalar iteration cost = %d "
4865                          "is greater or equal to the vectorization factor = %d"
4866                          ".\n",
4867                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4868       *ret_min_profitable_niters = -1;
4869       *ret_min_profitable_estimate = -1;
4870       return;
4871     }
4872
4873   /* ??? The "if" arm is written to handle all cases; see below for what
4874      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4875   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4876     {
4877       /* Rewriting the condition above in terms of the number of
4878          vector iterations (vniters) rather than the number of
4879          scalar iterations (niters) gives:
4880
4881          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4882
4883          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4884
4885          For integer N, X and Y when X > 0:
4886
4887          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4888       int outside_overhead = (vec_outside_cost
4889                               - scalar_single_iter_cost * peel_iters_prologue
4890                               - scalar_single_iter_cost * peel_iters_epilogue
4891                               - scalar_outside_cost);
4892       /* We're only interested in cases that require at least one
4893          vector iteration.  */
4894       int min_vec_niters = 1;
4895       if (outside_overhead > 0)
4896         min_vec_niters = outside_overhead / saving_per_viter + 1;
4897
4898       if (dump_enabled_p ())
4899         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4900                      min_vec_niters);
4901
4902       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4903         {
4904           /* Now that we know the minimum number of vector iterations,
4905              find the minimum niters for which the scalar cost is larger:
4906
4907              SIC * niters > VIC * vniters + VOC - SOC
4908
4909              We know that the minimum niters is no more than
4910              vniters * VF + NPEEL, but it might be (and often is) less
4911              than that if a partial vector iteration is cheaper than the
4912              equivalent scalar code.  */
4913           int threshold = (vec_inside_cost * min_vec_niters
4914                            + vec_outside_cost
4915                            - scalar_outside_cost);
4916           if (threshold <= 0)
4917             min_profitable_iters = 1;
4918           else
4919             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4920         }
4921       else
4922         /* Convert the number of vector iterations into a number of
4923            scalar iterations.  */
4924         min_profitable_iters = (min_vec_niters * assumed_vf
4925                                 + peel_iters_prologue
4926                                 + peel_iters_epilogue);
4927     }
4928   else
4929     {
4930       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4931                               * assumed_vf
4932                               - vec_inside_cost * peel_iters_prologue
4933                               - vec_inside_cost * peel_iters_epilogue);
4934       if (min_profitable_iters <= 0)
4935         min_profitable_iters = 0;
4936       else
4937         {
4938           min_profitable_iters /= saving_per_viter;
4939
4940           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4941               <= (((int) vec_inside_cost * min_profitable_iters)
4942                   + (((int) vec_outside_cost - scalar_outside_cost)
4943                      * assumed_vf)))
4944             min_profitable_iters++;
4945         }
4946     }
4947
4948   if (dump_enabled_p ())
4949     dump_printf (MSG_NOTE,
4950                  "  Calculated minimum iters for profitability: %d\n",
4951                  min_profitable_iters);
4952
4953   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4954       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4955     /* We want the vectorized loop to execute at least once.  */
4956     min_profitable_iters = assumed_vf + peel_iters_prologue;
4957   else if (min_profitable_iters < peel_iters_prologue)
4958     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4959        vectorized loop executes at least once.  */
4960     min_profitable_iters = peel_iters_prologue;
4961
4962   if (dump_enabled_p ())
4963     dump_printf_loc (MSG_NOTE, vect_location,
4964                      "  Runtime profitability threshold = %d\n",
4965                      min_profitable_iters);
4966
4967   *ret_min_profitable_niters = min_profitable_iters;
4968
4969   /* Calculate number of iterations required to make the vector version
4970      profitable, relative to the loop bodies only.
4971
4972      Non-vectorized variant is SIC * niters and it must win over vector
4973      variant on the expected loop trip count.  The following condition must hold true:
4974      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4975
4976   if (vec_outside_cost <= 0)
4977     min_profitable_estimate = 0;
4978   /* ??? This "else if" arm is written to handle all cases; see below for
4979      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4980   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4981     {
4982       /* This is a repeat of the code above, but with + SOC rather
4983          than - SOC.  */
4984       int outside_overhead = (vec_outside_cost
4985                               - scalar_single_iter_cost * peel_iters_prologue
4986                               - scalar_single_iter_cost * peel_iters_epilogue
4987                               + scalar_outside_cost);
4988       int min_vec_niters = 1;
4989       if (outside_overhead > 0)
4990         min_vec_niters = outside_overhead / saving_per_viter + 1;
4991
4992       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4993         {
4994           int threshold = (vec_inside_cost * min_vec_niters
4995                            + vec_outside_cost
4996                            + scalar_outside_cost);
4997           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4998         }
4999       else
5000         min_profitable_estimate = (min_vec_niters * assumed_vf
5001                                    + peel_iters_prologue
5002                                    + peel_iters_epilogue);
5003     }
5004   else
5005     {
5006       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5007                                  * assumed_vf
5008                                  - vec_inside_cost * peel_iters_prologue
5009                                  - vec_inside_cost * peel_iters_epilogue)
5010                                  / ((scalar_single_iter_cost * assumed_vf)
5011                                    - vec_inside_cost);
5012     }
5013   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5014   if (dump_enabled_p ())
5015     dump_printf_loc (MSG_NOTE, vect_location,
5016                      "  Static estimate profitability threshold = %d\n",
5017                      min_profitable_estimate);
5018
5019   *ret_min_profitable_estimate = min_profitable_estimate;
5020 }
5021
5022 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5023    vector elements (not bits) for a vector with NELT elements.  */
5024 static void
5025 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5026                               vec_perm_builder *sel)
5027 {
5028   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5029      by vec_perm_indices.  */
5030   sel->new_vector (nelt, 1, 3);
5031   for (unsigned int i = 0; i < 3; i++)
5032     sel->quick_push (i + offset);
5033 }
5034
5035 /* Checks whether the target supports whole-vector shifts for vectors of mode
5036    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5037    it supports vec_perm_const with masks for all necessary shift amounts.  */
5038 static bool
5039 have_whole_vector_shift (machine_mode mode)
5040 {
5041   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5042     return true;
5043
5044   /* Variable-length vectors should be handled via the optab.  */
5045   unsigned int nelt;
5046   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5047     return false;
5048
5049   vec_perm_builder sel;
5050   vec_perm_indices indices;
5051   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5052     {
5053       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5054       indices.new_vector (sel, 2, nelt);
5055       if (!can_vec_perm_const_p (mode, mode, indices, false))
5056         return false;
5057     }
5058   return true;
5059 }
5060
5061 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5062    multiplication operands have differing signs and (b) we intend
5063    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5064    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5065
5066 static bool
5067 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5068                                  stmt_vec_info stmt_info)
5069 {
5070   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5071   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5072     return false;
5073
5074   tree rhs1 = gimple_assign_rhs1 (assign);
5075   tree rhs2 = gimple_assign_rhs2 (assign);
5076   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5077     return false;
5078
5079   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5080   gcc_assert (reduc_info->is_reduc_info);
5081   return !directly_supported_p (DOT_PROD_EXPR,
5082                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5083                                 optab_vector_mixed_sign);
5084 }
5085
5086 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5087    functions. Design better to avoid maintenance issues.  */
5088
5089 /* Function vect_model_reduction_cost.
5090
5091    Models cost for a reduction operation, including the vector ops
5092    generated within the strip-mine loop in some cases, the initial
5093    definition before the loop, and the epilogue code that must be generated.  */
5094
5095 static void
5096 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5097                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5098                            vect_reduction_type reduction_type,
5099                            int ncopies, stmt_vector_for_cost *cost_vec)
5100 {
5101   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5102   tree vectype;
5103   machine_mode mode;
5104   class loop *loop = NULL;
5105
5106   if (loop_vinfo)
5107     loop = LOOP_VINFO_LOOP (loop_vinfo);
5108
5109   /* Condition reductions generate two reductions in the loop.  */
5110   if (reduction_type == COND_REDUCTION)
5111     ncopies *= 2;
5112
5113   vectype = STMT_VINFO_VECTYPE (stmt_info);
5114   mode = TYPE_MODE (vectype);
5115   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5116
5117   gimple_match_op op;
5118   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5119     gcc_unreachable ();
5120
5121   bool emulated_mixed_dot_prod
5122     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5123   if (reduction_type == EXTRACT_LAST_REDUCTION)
5124     /* No extra instructions are needed in the prologue.  The loop body
5125        operations are costed in vectorizable_condition.  */
5126     inside_cost = 0;
5127   else if (reduction_type == FOLD_LEFT_REDUCTION)
5128     {
5129       /* No extra instructions needed in the prologue.  */
5130       prologue_cost = 0;
5131
5132       if (reduc_fn != IFN_LAST)
5133         /* Count one reduction-like operation per vector.  */
5134         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5135                                         stmt_info, 0, vect_body);
5136       else
5137         {
5138           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5139           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5140           inside_cost = record_stmt_cost (cost_vec, nelements,
5141                                           vec_to_scalar, stmt_info, 0,
5142                                           vect_body);
5143           inside_cost += record_stmt_cost (cost_vec, nelements,
5144                                            scalar_stmt, stmt_info, 0,
5145                                            vect_body);
5146         }
5147     }
5148   else
5149     {
5150       /* Add in the cost of the initial definitions.  */
5151       int prologue_stmts;
5152       if (reduction_type == COND_REDUCTION)
5153         /* For cond reductions we have four vectors: initial index, step,
5154            initial result of the data reduction, initial value of the index
5155            reduction.  */
5156         prologue_stmts = 4;
5157       else if (emulated_mixed_dot_prod)
5158         /* We need the initial reduction value and two invariants:
5159            one that contains the minimum signed value and one that
5160            contains half of its negative.  */
5161         prologue_stmts = 3;
5162       else
5163         prologue_stmts = 1;
5164       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5165                                          scalar_to_vec, stmt_info, 0,
5166                                          vect_prologue);
5167     }
5168
5169   /* Determine cost of epilogue code.
5170
5171      We have a reduction operator that will reduce the vector in one statement.
5172      Also requires scalar extract.  */
5173
5174   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5175     {
5176       if (reduc_fn != IFN_LAST)
5177         {
5178           if (reduction_type == COND_REDUCTION)
5179             {
5180               /* An EQ stmt and an COND_EXPR stmt.  */
5181               epilogue_cost += record_stmt_cost (cost_vec, 2,
5182                                                  vector_stmt, stmt_info, 0,
5183                                                  vect_epilogue);
5184               /* Reduction of the max index and a reduction of the found
5185                  values.  */
5186               epilogue_cost += record_stmt_cost (cost_vec, 2,
5187                                                  vec_to_scalar, stmt_info, 0,
5188                                                  vect_epilogue);
5189               /* A broadcast of the max value.  */
5190               epilogue_cost += record_stmt_cost (cost_vec, 1,
5191                                                  scalar_to_vec, stmt_info, 0,
5192                                                  vect_epilogue);
5193             }
5194           else
5195             {
5196               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5197                                                  stmt_info, 0, vect_epilogue);
5198               epilogue_cost += record_stmt_cost (cost_vec, 1,
5199                                                  vec_to_scalar, stmt_info, 0,
5200                                                  vect_epilogue);
5201             }
5202         }
5203       else if (reduction_type == COND_REDUCTION)
5204         {
5205           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5206           /* Extraction of scalar elements.  */
5207           epilogue_cost += record_stmt_cost (cost_vec,
5208                                              2 * estimated_nunits,
5209                                              vec_to_scalar, stmt_info, 0,
5210                                              vect_epilogue);
5211           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5212           epilogue_cost += record_stmt_cost (cost_vec,
5213                                              2 * estimated_nunits - 3,
5214                                              scalar_stmt, stmt_info, 0,
5215                                              vect_epilogue);
5216         }
5217       else if (reduction_type == EXTRACT_LAST_REDUCTION
5218                || reduction_type == FOLD_LEFT_REDUCTION)
5219         /* No extra instructions need in the epilogue.  */
5220         ;
5221       else
5222         {
5223           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5224           tree bitsize = TYPE_SIZE (op.type);
5225           int element_bitsize = tree_to_uhwi (bitsize);
5226           int nelements = vec_size_in_bits / element_bitsize;
5227
5228           if (op.code == COND_EXPR)
5229             op.code = MAX_EXPR;
5230
5231           /* We have a whole vector shift available.  */
5232           if (VECTOR_MODE_P (mode)
5233               && directly_supported_p (op.code, vectype)
5234               && have_whole_vector_shift (mode))
5235             {
5236               /* Final reduction via vector shifts and the reduction operator.
5237                  Also requires scalar extract.  */
5238               epilogue_cost += record_stmt_cost (cost_vec,
5239                                                  exact_log2 (nelements) * 2,
5240                                                  vector_stmt, stmt_info, 0,
5241                                                  vect_epilogue);
5242               epilogue_cost += record_stmt_cost (cost_vec, 1,
5243                                                  vec_to_scalar, stmt_info, 0,
5244                                                  vect_epilogue);
5245             }
5246           else
5247             /* Use extracts and reduction op for final reduction.  For N
5248                elements, we have N extracts and N-1 reduction ops.  */
5249             epilogue_cost += record_stmt_cost (cost_vec,
5250                                                nelements + nelements - 1,
5251                                                vector_stmt, stmt_info, 0,
5252                                                vect_epilogue);
5253         }
5254     }
5255
5256   if (dump_enabled_p ())
5257     dump_printf (MSG_NOTE,
5258                  "vect_model_reduction_cost: inside_cost = %d, "
5259                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5260                  prologue_cost, epilogue_cost);
5261 }
5262
5263 /* SEQ is a sequence of instructions that initialize the reduction
5264    described by REDUC_INFO.  Emit them in the appropriate place.  */
5265
5266 static void
5267 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5268                                 stmt_vec_info reduc_info, gimple *seq)
5269 {
5270   if (reduc_info->reused_accumulator)
5271     {
5272       /* When reusing an accumulator from the main loop, we only need
5273          initialization instructions if the main loop can be skipped.
5274          In that case, emit the initialization instructions at the end
5275          of the guard block that does the skip.  */
5276       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5277       gcc_assert (skip_edge);
5278       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5279       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5280     }
5281   else
5282     {
5283       /* The normal case: emit the initialization instructions on the
5284          preheader edge.  */
5285       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5286       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5287     }
5288 }
5289
5290 /* Function get_initial_def_for_reduction
5291
5292    Input:
5293    REDUC_INFO - the info_for_reduction
5294    INIT_VAL - the initial value of the reduction variable
5295    NEUTRAL_OP - a value that has no effect on the reduction, as per
5296                 neutral_op_for_reduction
5297
5298    Output:
5299    Return a vector variable, initialized according to the operation that
5300         STMT_VINFO performs. This vector will be used as the initial value
5301         of the vector of partial results.
5302
5303    The value we need is a vector in which element 0 has value INIT_VAL
5304    and every other element has value NEUTRAL_OP.  */
5305
5306 static tree
5307 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5308                                stmt_vec_info reduc_info,
5309                                tree init_val, tree neutral_op)
5310 {
5311   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5312   tree scalar_type = TREE_TYPE (init_val);
5313   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5314   tree init_def;
5315   gimple_seq stmts = NULL;
5316
5317   gcc_assert (vectype);
5318
5319   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5320               || SCALAR_FLOAT_TYPE_P (scalar_type));
5321
5322   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5323               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5324
5325   if (operand_equal_p (init_val, neutral_op))
5326     {
5327       /* If both elements are equal then the vector described above is
5328          just a splat.  */
5329       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5330       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5331     }
5332   else
5333     {
5334       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5335       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5336       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5337         {
5338           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5339              element 0.  */
5340           init_def = gimple_build_vector_from_val (&stmts, vectype,
5341                                                    neutral_op);
5342           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5343                                    vectype, init_def, init_val);
5344         }
5345       else
5346         {
5347           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5348           tree_vector_builder elts (vectype, 1, 2);
5349           elts.quick_push (init_val);
5350           elts.quick_push (neutral_op);
5351           init_def = gimple_build_vector (&stmts, &elts);
5352         }
5353     }
5354
5355   if (stmts)
5356     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5357   return init_def;
5358 }
5359
5360 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5361    which performs a reduction involving GROUP_SIZE scalar statements.
5362    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5363    is nonnull, introducing extra elements of that value will not change the
5364    result.  */
5365
5366 static void
5367 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5368                                 stmt_vec_info reduc_info,
5369                                 vec<tree> *vec_oprnds,
5370                                 unsigned int number_of_vectors,
5371                                 unsigned int group_size, tree neutral_op)
5372 {
5373   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5374   unsigned HOST_WIDE_INT nunits;
5375   unsigned j, number_of_places_left_in_vector;
5376   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5377   unsigned int i;
5378
5379   gcc_assert (group_size == initial_values.length () || neutral_op);
5380
5381   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5382      created vectors. It is greater than 1 if unrolling is performed.
5383
5384      For example, we have two scalar operands, s1 and s2 (e.g., group of
5385      strided accesses of size two), while NUNITS is four (i.e., four scalars
5386      of this type can be packed in a vector).  The output vector will contain
5387      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5388      will be 2).
5389
5390      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5391      vectors containing the operands.
5392
5393      For example, NUNITS is four as before, and the group size is 8
5394      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5395      {s5, s6, s7, s8}.  */
5396
5397   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5398     nunits = group_size;
5399
5400   number_of_places_left_in_vector = nunits;
5401   bool constant_p = true;
5402   tree_vector_builder elts (vector_type, nunits, 1);
5403   elts.quick_grow (nunits);
5404   gimple_seq ctor_seq = NULL;
5405   for (j = 0; j < nunits * number_of_vectors; ++j)
5406     {
5407       tree op;
5408       i = j % group_size;
5409
5410       /* Get the def before the loop.  In reduction chain we have only
5411          one initial value.  Else we have as many as PHIs in the group.  */
5412       if (i >= initial_values.length () || (j > i && neutral_op))
5413         op = neutral_op;
5414       else
5415         op = initial_values[i];
5416
5417       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5418       number_of_places_left_in_vector--;
5419       elts[nunits - number_of_places_left_in_vector - 1] = op;
5420       if (!CONSTANT_CLASS_P (op))
5421         constant_p = false;
5422
5423       if (number_of_places_left_in_vector == 0)
5424         {
5425           tree init;
5426           if (constant_p && !neutral_op
5427               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5428               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5429             /* Build the vector directly from ELTS.  */
5430             init = gimple_build_vector (&ctor_seq, &elts);
5431           else if (neutral_op)
5432             {
5433               /* Build a vector of the neutral value and shift the
5434                  other elements into place.  */
5435               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5436                                                    neutral_op);
5437               int k = nunits;
5438               while (k > 0 && elts[k - 1] == neutral_op)
5439                 k -= 1;
5440               while (k > 0)
5441                 {
5442                   k -= 1;
5443                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5444                                        vector_type, init, elts[k]);
5445                 }
5446             }
5447           else
5448             {
5449               /* First time round, duplicate ELTS to fill the
5450                  required number of vectors.  */
5451               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5452                                         elts, number_of_vectors, *vec_oprnds);
5453               break;
5454             }
5455           vec_oprnds->quick_push (init);
5456
5457           number_of_places_left_in_vector = nunits;
5458           elts.new_vector (vector_type, nunits, 1);
5459           elts.quick_grow (nunits);
5460           constant_p = true;
5461         }
5462     }
5463   if (ctor_seq != NULL)
5464     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5465 }
5466
5467 /* For a statement STMT_INFO taking part in a reduction operation return
5468    the stmt_vec_info the meta information is stored on.  */
5469
5470 stmt_vec_info
5471 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5472 {
5473   stmt_info = vect_orig_stmt (stmt_info);
5474   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5475   if (!is_a <gphi *> (stmt_info->stmt)
5476       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5477     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5478   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5479   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5480     {
5481       if (gimple_phi_num_args (phi) == 1)
5482         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5483     }
5484   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5485     {
5486       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5487       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5488         stmt_info = info;
5489     }
5490   return stmt_info;
5491 }
5492
5493 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5494    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5495    return false.  */
5496
5497 static bool
5498 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5499                                 stmt_vec_info reduc_info)
5500 {
5501   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5502   if (!main_loop_vinfo)
5503     return false;
5504
5505   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5506     return false;
5507
5508   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5509   auto_vec<tree, 16> main_loop_results (num_phis);
5510   auto_vec<tree, 16> initial_values (num_phis);
5511   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5512     {
5513       /* The epilogue loop can be entered either from the main loop or
5514          from an earlier guard block.  */
5515       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5516       for (tree incoming_value : reduc_info->reduc_initial_values)
5517         {
5518           /* Look for:
5519
5520                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5521                                     INITIAL_VALUE(guard block)>.  */
5522           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5523
5524           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5525           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5526
5527           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5528           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5529
5530           main_loop_results.quick_push (from_main_loop);
5531           initial_values.quick_push (from_skip);
5532         }
5533     }
5534   else
5535     /* The main loop dominates the epilogue loop.  */
5536     main_loop_results.splice (reduc_info->reduc_initial_values);
5537
5538   /* See if the main loop has the kind of accumulator we need.  */
5539   vect_reusable_accumulator *accumulator
5540     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5541   if (!accumulator
5542       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5543       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5544                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5545     return false;
5546
5547   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5548   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5549   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5550   unsigned HOST_WIDE_INT m;
5551   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5552                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5553     return false;
5554   /* Check the intermediate vector types and operations are available.  */
5555   tree prev_vectype = old_vectype;
5556   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5557   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5558     {
5559       intermediate_nunits = exact_div (intermediate_nunits, 2);
5560       tree intermediate_vectype = get_related_vectype_for_scalar_type
5561         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5562       if (!intermediate_vectype
5563           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5564                                     intermediate_vectype)
5565           || !can_vec_extract (TYPE_MODE (prev_vectype),
5566                                TYPE_MODE (intermediate_vectype)))
5567         return false;
5568       prev_vectype = intermediate_vectype;
5569     }
5570
5571   /* Non-SLP reductions might apply an adjustment after the reduction
5572      operation, in order to simplify the initialization of the accumulator.
5573      If the epilogue loop carries on from where the main loop left off,
5574      it should apply the same adjustment to the final reduction result.
5575
5576      If the epilogue loop can also be entered directly (rather than via
5577      the main loop), we need to be able to handle that case in the same way,
5578      with the same adjustment.  (In principle we could add a PHI node
5579      to select the correct adjustment, but in practice that shouldn't be
5580      necessary.)  */
5581   tree main_adjustment
5582     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5583   if (loop_vinfo->main_loop_edge && main_adjustment)
5584     {
5585       gcc_assert (num_phis == 1);
5586       tree initial_value = initial_values[0];
5587       /* Check that we can use INITIAL_VALUE as the adjustment and
5588          initialize the accumulator with a neutral value instead.  */
5589       if (!operand_equal_p (initial_value, main_adjustment))
5590         return false;
5591       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5592       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5593                                                     code, initial_value);
5594     }
5595   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5596   reduc_info->reduc_initial_values.truncate (0);
5597   reduc_info->reduc_initial_values.splice (initial_values);
5598   reduc_info->reused_accumulator = accumulator;
5599   return true;
5600 }
5601
5602 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5603    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5604
5605 static tree
5606 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5607                             gimple_seq *seq)
5608 {
5609   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5610   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5611   tree stype = TREE_TYPE (vectype);
5612   tree new_temp = vec_def;
5613   while (nunits > nunits1)
5614     {
5615       nunits /= 2;
5616       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5617                                                            stype, nunits);
5618       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5619
5620       /* The target has to make sure we support lowpart/highpart
5621          extraction, either via direct vector extract or through
5622          an integer mode punning.  */
5623       tree dst1, dst2;
5624       gimple *epilog_stmt;
5625       if (convert_optab_handler (vec_extract_optab,
5626                                  TYPE_MODE (TREE_TYPE (new_temp)),
5627                                  TYPE_MODE (vectype1))
5628           != CODE_FOR_nothing)
5629         {
5630           /* Extract sub-vectors directly once vec_extract becomes
5631              a conversion optab.  */
5632           dst1 = make_ssa_name (vectype1);
5633           epilog_stmt
5634               = gimple_build_assign (dst1, BIT_FIELD_REF,
5635                                      build3 (BIT_FIELD_REF, vectype1,
5636                                              new_temp, TYPE_SIZE (vectype1),
5637                                              bitsize_int (0)));
5638           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5639           dst2 =  make_ssa_name (vectype1);
5640           epilog_stmt
5641               = gimple_build_assign (dst2, BIT_FIELD_REF,
5642                                      build3 (BIT_FIELD_REF, vectype1,
5643                                              new_temp, TYPE_SIZE (vectype1),
5644                                              bitsize_int (bitsize)));
5645           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5646         }
5647       else
5648         {
5649           /* Extract via punning to appropriately sized integer mode
5650              vector.  */
5651           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5652           tree etype = build_vector_type (eltype, 2);
5653           gcc_assert (convert_optab_handler (vec_extract_optab,
5654                                              TYPE_MODE (etype),
5655                                              TYPE_MODE (eltype))
5656                       != CODE_FOR_nothing);
5657           tree tem = make_ssa_name (etype);
5658           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5659                                              build1 (VIEW_CONVERT_EXPR,
5660                                                      etype, new_temp));
5661           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5662           new_temp = tem;
5663           tem = make_ssa_name (eltype);
5664           epilog_stmt
5665               = gimple_build_assign (tem, BIT_FIELD_REF,
5666                                      build3 (BIT_FIELD_REF, eltype,
5667                                              new_temp, TYPE_SIZE (eltype),
5668                                              bitsize_int (0)));
5669           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5670           dst1 = make_ssa_name (vectype1);
5671           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5672                                              build1 (VIEW_CONVERT_EXPR,
5673                                                      vectype1, tem));
5674           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5675           tem = make_ssa_name (eltype);
5676           epilog_stmt
5677               = gimple_build_assign (tem, BIT_FIELD_REF,
5678                                      build3 (BIT_FIELD_REF, eltype,
5679                                              new_temp, TYPE_SIZE (eltype),
5680                                              bitsize_int (bitsize)));
5681           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5682           dst2 =  make_ssa_name (vectype1);
5683           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5684                                              build1 (VIEW_CONVERT_EXPR,
5685                                                      vectype1, tem));
5686           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5687         }
5688
5689       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5690     }
5691
5692   return new_temp;
5693 }
5694
5695 /* Function vect_create_epilog_for_reduction
5696
5697    Create code at the loop-epilog to finalize the result of a reduction
5698    computation.
5699
5700    STMT_INFO is the scalar reduction stmt that is being vectorized.
5701    SLP_NODE is an SLP node containing a group of reduction statements. The
5702      first one in this group is STMT_INFO.
5703    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5704    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5705      (counting from 0)
5706
5707    This function:
5708    1. Completes the reduction def-use cycles.
5709    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5710       by calling the function specified by REDUC_FN if available, or by
5711       other means (whole-vector shifts or a scalar loop).
5712       The function also creates a new phi node at the loop exit to preserve
5713       loop-closed form, as illustrated below.
5714
5715      The flow at the entry to this function:
5716
5717         loop:
5718           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5719           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5720           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5721         loop_exit:
5722           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5723           use <s_out0>
5724           use <s_out0>
5725
5726      The above is transformed by this function into:
5727
5728         loop:
5729           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5730           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5731           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5732         loop_exit:
5733           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5734           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5735           v_out2 = reduce <v_out1>
5736           s_out3 = extract_field <v_out2, 0>
5737           s_out4 = adjust_result <s_out3>
5738           use <s_out4>
5739           use <s_out4>
5740 */
5741
5742 static void
5743 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5744                                   stmt_vec_info stmt_info,
5745                                   slp_tree slp_node,
5746                                   slp_instance slp_node_instance)
5747 {
5748   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5749   gcc_assert (reduc_info->is_reduc_info);
5750   /* For double reductions we need to get at the inner loop reduction
5751      stmt which has the meta info attached.  Our stmt_info is that of the
5752      loop-closed PHI of the inner loop which we remember as
5753      def for the reduction PHI generation.  */
5754   bool double_reduc = false;
5755   stmt_vec_info rdef_info = stmt_info;
5756   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5757     {
5758       gcc_assert (!slp_node);
5759       double_reduc = true;
5760       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5761                                             (stmt_info->stmt, 0));
5762       stmt_info = vect_stmt_to_vectorize (stmt_info);
5763     }
5764   gphi *reduc_def_stmt
5765     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5766   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5767   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5768   tree vectype;
5769   machine_mode mode;
5770   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5771   basic_block exit_bb;
5772   tree scalar_dest;
5773   tree scalar_type;
5774   gimple *new_phi = NULL, *phi;
5775   gimple_stmt_iterator exit_gsi;
5776   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5777   gimple *epilog_stmt = NULL;
5778   gimple *exit_phi;
5779   tree bitsize;
5780   tree def;
5781   tree orig_name, scalar_result;
5782   imm_use_iterator imm_iter, phi_imm_iter;
5783   use_operand_p use_p, phi_use_p;
5784   gimple *use_stmt;
5785   auto_vec<tree> reduc_inputs;
5786   int j, i;
5787   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5788   unsigned int group_size = 1, k;
5789   auto_vec<gimple *> phis;
5790   /* SLP reduction without reduction chain, e.g.,
5791      # a1 = phi <a2, a0>
5792      # b1 = phi <b2, b0>
5793      a2 = operation (a1)
5794      b2 = operation (b1)  */
5795   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5796   bool direct_slp_reduc;
5797   tree induction_index = NULL_TREE;
5798
5799   if (slp_node)
5800     group_size = SLP_TREE_LANES (slp_node);
5801
5802   if (nested_in_vect_loop_p (loop, stmt_info))
5803     {
5804       outer_loop = loop;
5805       loop = loop->inner;
5806       gcc_assert (!slp_node && double_reduc);
5807     }
5808
5809   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5810   gcc_assert (vectype);
5811   mode = TYPE_MODE (vectype);
5812
5813   tree induc_val = NULL_TREE;
5814   tree adjustment_def = NULL;
5815   if (slp_node)
5816     ;
5817   else
5818     {
5819       /* Optimize: for induction condition reduction, if we can't use zero
5820          for induc_val, use initial_def.  */
5821       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5822         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5823       else if (double_reduc)
5824         ;
5825       else
5826         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5827     }
5828
5829   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5830   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5831   if (slp_reduc)
5832     /* All statements produce live-out values.  */
5833     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5834   else if (slp_node)
5835     {
5836       /* The last statement in the reduction chain produces the live-out
5837          value.  Note SLP optimization can shuffle scalar stmts to
5838          optimize permutations so we have to search for the last stmt.  */
5839       for (k = 0; k < group_size; ++k)
5840         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5841           {
5842             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5843             break;
5844           }
5845     }
5846
5847   unsigned vec_num;
5848   int ncopies;
5849   if (slp_node)
5850     {
5851       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5852       ncopies = 1;
5853     }
5854   else
5855     {
5856       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5857       vec_num = 1;
5858       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5859     }
5860
5861   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5862      which is updated with the current index of the loop for every match of
5863      the original loop's cond_expr (VEC_STMT).  This results in a vector
5864      containing the last time the condition passed for that vector lane.
5865      The first match will be a 1 to allow 0 to be used for non-matching
5866      indexes.  If there are no matches at all then the vector will be all
5867      zeroes.
5868
5869      PR92772: This algorithm is broken for architectures that support
5870      masked vectors, but do not provide fold_extract_last.  */
5871   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5872     {
5873       auto_vec<std::pair<tree, bool>, 2> ccompares;
5874       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5875       cond_info = vect_stmt_to_vectorize (cond_info);
5876       while (cond_info != reduc_info)
5877         {
5878           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5879             {
5880               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5881               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5882               ccompares.safe_push
5883                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5884                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
5885             }
5886           cond_info
5887             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5888                                                  1 + STMT_VINFO_REDUC_IDX
5889                                                         (cond_info)));
5890           cond_info = vect_stmt_to_vectorize (cond_info);
5891         }
5892       gcc_assert (ccompares.length () != 0);
5893
5894       tree indx_before_incr, indx_after_incr;
5895       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5896       int scalar_precision
5897         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5898       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5899       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5900         (TYPE_MODE (vectype), cr_index_scalar_type,
5901          TYPE_VECTOR_SUBPARTS (vectype));
5902
5903       /* First we create a simple vector induction variable which starts
5904          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5905          vector size (STEP).  */
5906
5907       /* Create a {1,2,3,...} vector.  */
5908       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5909
5910       /* Create a vector of the step value.  */
5911       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5912       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5913
5914       /* Create an induction variable.  */
5915       gimple_stmt_iterator incr_gsi;
5916       bool insert_after;
5917       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5918       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5919                  insert_after, &indx_before_incr, &indx_after_incr);
5920
5921       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5922          filled with zeros (VEC_ZERO).  */
5923
5924       /* Create a vector of 0s.  */
5925       tree zero = build_zero_cst (cr_index_scalar_type);
5926       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5927
5928       /* Create a vector phi node.  */
5929       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5930       new_phi = create_phi_node (new_phi_tree, loop->header);
5931       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5932                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5933
5934       /* Now take the condition from the loop's original cond_exprs
5935          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5936          every match uses values from the induction variable
5937          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5938          (NEW_PHI_TREE).
5939          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5940          the new cond_expr (INDEX_COND_EXPR).  */
5941       gimple_seq stmts = NULL;
5942       for (int i = ccompares.length () - 1; i != -1; --i)
5943         {
5944           tree ccompare = ccompares[i].first;
5945           if (ccompares[i].second)
5946             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5947                                          cr_index_vector_type,
5948                                          ccompare,
5949                                          indx_before_incr, new_phi_tree);
5950           else
5951             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5952                                          cr_index_vector_type,
5953                                          ccompare,
5954                                          new_phi_tree, indx_before_incr);
5955         }
5956       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5957
5958       /* Update the phi with the vec cond.  */
5959       induction_index = new_phi_tree;
5960       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5961                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5962     }
5963
5964   /* 2. Create epilog code.
5965         The reduction epilog code operates across the elements of the vector
5966         of partial results computed by the vectorized loop.
5967         The reduction epilog code consists of:
5968
5969         step 1: compute the scalar result in a vector (v_out2)
5970         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5971         step 3: adjust the scalar result (s_out3) if needed.
5972
5973         Step 1 can be accomplished using one of the following three schemes:
5974           (scheme 1) using reduc_fn, if available.
5975           (scheme 2) using whole-vector shifts, if available.
5976           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5977                      combined.
5978
5979           The overall epilog code looks like this:
5980
5981           s_out0 = phi <s_loop>         # original EXIT_PHI
5982           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5983           v_out2 = reduce <v_out1>              # step 1
5984           s_out3 = extract_field <v_out2, 0>    # step 2
5985           s_out4 = adjust_result <s_out3>       # step 3
5986
5987           (step 3 is optional, and steps 1 and 2 may be combined).
5988           Lastly, the uses of s_out0 are replaced by s_out4.  */
5989
5990
5991   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5992          v_out1 = phi <VECT_DEF>
5993          Store them in NEW_PHIS.  */
5994   if (double_reduc)
5995     loop = outer_loop;
5996   exit_bb = single_exit (loop)->dest;
5997   exit_gsi = gsi_after_labels (exit_bb);
5998   reduc_inputs.create (slp_node ? vec_num : ncopies);
5999   for (unsigned i = 0; i < vec_num; i++)
6000     {
6001       gimple_seq stmts = NULL;
6002       if (slp_node)
6003         def = vect_get_slp_vect_def (slp_node, i);
6004       else
6005         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6006       for (j = 0; j < ncopies; j++)
6007         {
6008           tree new_def = copy_ssa_name (def);
6009           phi = create_phi_node (new_def, exit_bb);
6010           if (j)
6011             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6012           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
6013           new_def = gimple_convert (&stmts, vectype, new_def);
6014           reduc_inputs.quick_push (new_def);
6015         }
6016       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6017     }
6018
6019   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6020          (i.e. when reduc_fn is not available) and in the final adjustment
6021          code (if needed).  Also get the original scalar reduction variable as
6022          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6023          represents a reduction pattern), the tree-code and scalar-def are
6024          taken from the original stmt that the pattern-stmt (STMT) replaces.
6025          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6026          are taken from STMT.  */
6027
6028   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6029   if (orig_stmt_info != stmt_info)
6030     {
6031       /* Reduction pattern  */
6032       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6033       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6034     }
6035
6036   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6037   scalar_type = TREE_TYPE (scalar_dest);
6038   scalar_results.truncate (0);
6039   scalar_results.reserve_exact (group_size);
6040   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6041   bitsize = TYPE_SIZE (scalar_type);
6042
6043   /* True if we should implement SLP_REDUC using native reduction operations
6044      instead of scalar operations.  */
6045   direct_slp_reduc = (reduc_fn != IFN_LAST
6046                       && slp_reduc
6047                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6048
6049   /* In case of reduction chain, e.g.,
6050      # a1 = phi <a3, a0>
6051      a2 = operation (a1)
6052      a3 = operation (a2),
6053
6054      we may end up with more than one vector result.  Here we reduce them
6055      to one vector.
6056
6057      The same is true for a SLP reduction, e.g.,
6058      # a1 = phi <a2, a0>
6059      # b1 = phi <b2, b0>
6060      a2 = operation (a1)
6061      b2 = operation (a2),
6062
6063      where we can end up with more than one vector as well.  We can
6064      easily accumulate vectors when the number of vector elements is
6065      a multiple of the SLP group size.
6066
6067      The same is true if we couldn't use a single defuse cycle.  */
6068   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6069       || direct_slp_reduc
6070       || (slp_reduc
6071           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6072       || ncopies > 1)
6073     {
6074       gimple_seq stmts = NULL;
6075       tree single_input = reduc_inputs[0];
6076       for (k = 1; k < reduc_inputs.length (); k++)
6077         single_input = gimple_build (&stmts, code, vectype,
6078                                      single_input, reduc_inputs[k]);
6079       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6080
6081       reduc_inputs.truncate (0);
6082       reduc_inputs.safe_push (single_input);
6083     }
6084
6085   tree orig_reduc_input = reduc_inputs[0];
6086
6087   /* If this loop is an epilogue loop that can be skipped after the
6088      main loop, we can only share a reduction operation between the
6089      main loop and the epilogue if we put it at the target of the
6090      skip edge.
6091
6092      We can still reuse accumulators if this check fails.  Doing so has
6093      the minor(?) benefit of making the epilogue loop's scalar result
6094      independent of the main loop's scalar result.  */
6095   bool unify_with_main_loop_p = false;
6096   if (reduc_info->reused_accumulator
6097       && loop_vinfo->skip_this_loop_edge
6098       && single_succ_p (exit_bb)
6099       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6100     {
6101       unify_with_main_loop_p = true;
6102
6103       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6104       reduc_inputs[0] = make_ssa_name (vectype);
6105       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6106       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6107                    UNKNOWN_LOCATION);
6108       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6109                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6110       exit_gsi = gsi_after_labels (reduc_block);
6111     }
6112
6113   /* Shouldn't be used beyond this point.  */
6114   exit_bb = nullptr;
6115
6116   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6117       && reduc_fn != IFN_LAST)
6118     {
6119       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6120          various data values where the condition matched and another vector
6121          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6122          need to extract the last matching index (which will be the index with
6123          highest value) and use this to index into the data vector.
6124          For the case where there were no matches, the data vector will contain
6125          all default values and the index vector will be all zeros.  */
6126
6127       /* Get various versions of the type of the vector of indexes.  */
6128       tree index_vec_type = TREE_TYPE (induction_index);
6129       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6130       tree index_scalar_type = TREE_TYPE (index_vec_type);
6131       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6132
6133       /* Get an unsigned integer version of the type of the data vector.  */
6134       int scalar_precision
6135         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6136       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6137       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6138                                                 vectype);
6139
6140       /* First we need to create a vector (ZERO_VEC) of zeros and another
6141          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6142          can create using a MAX reduction and then expanding.
6143          In the case where the loop never made any matches, the max index will
6144          be zero.  */
6145
6146       /* Vector of {0, 0, 0,...}.  */
6147       tree zero_vec = build_zero_cst (vectype);
6148
6149       /* Find maximum value from the vector of found indexes.  */
6150       tree max_index = make_ssa_name (index_scalar_type);
6151       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6152                                                           1, induction_index);
6153       gimple_call_set_lhs (max_index_stmt, max_index);
6154       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6155
6156       /* Vector of {max_index, max_index, max_index,...}.  */
6157       tree max_index_vec = make_ssa_name (index_vec_type);
6158       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6159                                                       max_index);
6160       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6161                                                         max_index_vec_rhs);
6162       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6163
6164       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6165          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6166          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6167          otherwise.  Only one value should match, resulting in a vector
6168          (VEC_COND) with one data value and the rest zeros.
6169          In the case where the loop never made any matches, every index will
6170          match, resulting in a vector with all data values (which will all be
6171          the default value).  */
6172
6173       /* Compare the max index vector to the vector of found indexes to find
6174          the position of the max value.  */
6175       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6176       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6177                                                       induction_index,
6178                                                       max_index_vec);
6179       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6180
6181       /* Use the compare to choose either values from the data vector or
6182          zero.  */
6183       tree vec_cond = make_ssa_name (vectype);
6184       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6185                                                    vec_compare,
6186                                                    reduc_inputs[0],
6187                                                    zero_vec);
6188       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6189
6190       /* Finally we need to extract the data value from the vector (VEC_COND)
6191          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
6192          reduction, but because this doesn't exist, we can use a MAX reduction
6193          instead.  The data value might be signed or a float so we need to cast
6194          it first.
6195          In the case where the loop never made any matches, the data values are
6196          all identical, and so will reduce down correctly.  */
6197
6198       /* Make the matched data values unsigned.  */
6199       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6200       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6201                                        vec_cond);
6202       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6203                                                         VIEW_CONVERT_EXPR,
6204                                                         vec_cond_cast_rhs);
6205       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6206
6207       /* Reduce down to a scalar value.  */
6208       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6209       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6210                                                            1, vec_cond_cast);
6211       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6212       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6213
6214       /* Convert the reduced value back to the result type and set as the
6215          result.  */
6216       gimple_seq stmts = NULL;
6217       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6218                                data_reduc);
6219       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6220       scalar_results.safe_push (new_temp);
6221     }
6222   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6223            && reduc_fn == IFN_LAST)
6224     {
6225       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6226          idx = 0;
6227          idx_val = induction_index[0];
6228          val = data_reduc[0];
6229          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6230            if (induction_index[i] > idx_val)
6231              val = data_reduc[i], idx_val = induction_index[i];
6232          return val;  */
6233
6234       tree data_eltype = TREE_TYPE (vectype);
6235       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6236       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6237       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6238       /* Enforced by vectorizable_reduction, which ensures we have target
6239          support before allowing a conditional reduction on variable-length
6240          vectors.  */
6241       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6242       tree idx_val = NULL_TREE, val = NULL_TREE;
6243       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6244         {
6245           tree old_idx_val = idx_val;
6246           tree old_val = val;
6247           idx_val = make_ssa_name (idx_eltype);
6248           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6249                                              build3 (BIT_FIELD_REF, idx_eltype,
6250                                                      induction_index,
6251                                                      bitsize_int (el_size),
6252                                                      bitsize_int (off)));
6253           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6254           val = make_ssa_name (data_eltype);
6255           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6256                                              build3 (BIT_FIELD_REF,
6257                                                      data_eltype,
6258                                                      reduc_inputs[0],
6259                                                      bitsize_int (el_size),
6260                                                      bitsize_int (off)));
6261           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6262           if (off != 0)
6263             {
6264               tree new_idx_val = idx_val;
6265               if (off != v_size - el_size)
6266                 {
6267                   new_idx_val = make_ssa_name (idx_eltype);
6268                   epilog_stmt = gimple_build_assign (new_idx_val,
6269                                                      MAX_EXPR, idx_val,
6270                                                      old_idx_val);
6271                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6272                 }
6273               tree cond = make_ssa_name (boolean_type_node);
6274               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6275                                                  idx_val, old_idx_val);
6276               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6277               tree new_val = make_ssa_name (data_eltype);
6278               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6279                                                  cond, val, old_val);
6280               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6281               idx_val = new_idx_val;
6282               val = new_val;
6283             }
6284         }
6285       /* Convert the reduced value back to the result type and set as the
6286          result.  */
6287       gimple_seq stmts = NULL;
6288       val = gimple_convert (&stmts, scalar_type, val);
6289       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6290       scalar_results.safe_push (val);
6291     }
6292
6293   /* 2.3 Create the reduction code, using one of the three schemes described
6294          above. In SLP we simply need to extract all the elements from the
6295          vector (without reducing them), so we use scalar shifts.  */
6296   else if (reduc_fn != IFN_LAST && !slp_reduc)
6297     {
6298       tree tmp;
6299       tree vec_elem_type;
6300
6301       /* Case 1:  Create:
6302          v_out2 = reduc_expr <v_out1>  */
6303
6304       if (dump_enabled_p ())
6305         dump_printf_loc (MSG_NOTE, vect_location,
6306                          "Reduce using direct vector reduction.\n");
6307
6308       gimple_seq stmts = NULL;
6309       vec_elem_type = TREE_TYPE (vectype);
6310       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6311                                vec_elem_type, reduc_inputs[0]);
6312       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6313       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6314
6315       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6316           && induc_val)
6317         {
6318           /* Earlier we set the initial value to be a vector of induc_val
6319              values.  Check the result and if it is induc_val then replace
6320              with the original initial value, unless induc_val is
6321              the same as initial_def already.  */
6322           tree zcompare = make_ssa_name (boolean_type_node);
6323           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6324                                              new_temp, induc_val);
6325           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6326           tree initial_def = reduc_info->reduc_initial_values[0];
6327           tmp = make_ssa_name (new_scalar_dest);
6328           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6329                                              initial_def, new_temp);
6330           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6331           new_temp = tmp;
6332         }
6333
6334       scalar_results.safe_push (new_temp);
6335     }
6336   else if (direct_slp_reduc)
6337     {
6338       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6339          with the elements for other SLP statements replaced with the
6340          neutral value.  We can then do a normal reduction on each vector.  */
6341
6342       /* Enforced by vectorizable_reduction.  */
6343       gcc_assert (reduc_inputs.length () == 1);
6344       gcc_assert (pow2p_hwi (group_size));
6345
6346       gimple_seq seq = NULL;
6347
6348       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6349          and the same element size as VECTYPE.  */
6350       tree index = build_index_vector (vectype, 0, 1);
6351       tree index_type = TREE_TYPE (index);
6352       tree index_elt_type = TREE_TYPE (index_type);
6353       tree mask_type = truth_type_for (index_type);
6354
6355       /* Create a vector that, for each element, identifies which of
6356          the REDUC_GROUP_SIZE results should use it.  */
6357       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6358       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6359                             build_vector_from_val (index_type, index_mask));
6360
6361       /* Get a neutral vector value.  This is simply a splat of the neutral
6362          scalar value if we have one, otherwise the initial scalar value
6363          is itself a neutral value.  */
6364       tree vector_identity = NULL_TREE;
6365       tree neutral_op = NULL_TREE;
6366       if (slp_node)
6367         {
6368           tree initial_value = NULL_TREE;
6369           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6370             initial_value = reduc_info->reduc_initial_values[0];
6371           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6372                                                  initial_value);
6373         }
6374       if (neutral_op)
6375         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6376                                                         neutral_op);
6377       for (unsigned int i = 0; i < group_size; ++i)
6378         {
6379           /* If there's no universal neutral value, we can use the
6380              initial scalar value from the original PHI.  This is used
6381              for MIN and MAX reduction, for example.  */
6382           if (!neutral_op)
6383             {
6384               tree scalar_value = reduc_info->reduc_initial_values[i];
6385               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6386                                              scalar_value);
6387               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6388                                                               scalar_value);
6389             }
6390
6391           /* Calculate the equivalent of:
6392
6393              sel[j] = (index[j] == i);
6394
6395              which selects the elements of REDUC_INPUTS[0] that should
6396              be included in the result.  */
6397           tree compare_val = build_int_cst (index_elt_type, i);
6398           compare_val = build_vector_from_val (index_type, compare_val);
6399           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6400                                    index, compare_val);
6401
6402           /* Calculate the equivalent of:
6403
6404              vec = sel ? reduc_inputs[0] : vector_identity;
6405
6406              VEC is now suitable for a full vector reduction.  */
6407           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6408                                    sel, reduc_inputs[0], vector_identity);
6409
6410           /* Do the reduction and convert it to the appropriate type.  */
6411           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6412                                       TREE_TYPE (vectype), vec);
6413           scalar = gimple_convert (&seq, scalar_type, scalar);
6414           scalar_results.safe_push (scalar);
6415         }
6416       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6417     }
6418   else
6419     {
6420       bool reduce_with_shift;
6421       tree vec_temp;
6422
6423       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6424
6425       /* See if the target wants to do the final (shift) reduction
6426          in a vector mode of smaller size and first reduce upper/lower
6427          halves against each other.  */
6428       enum machine_mode mode1 = mode;
6429       tree stype = TREE_TYPE (vectype);
6430       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6431       unsigned nunits1 = nunits;
6432       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6433           && reduc_inputs.length () == 1)
6434         {
6435           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6436           /* For SLP reductions we have to make sure lanes match up, but
6437              since we're doing individual element final reduction reducing
6438              vector width here is even more important.
6439              ???  We can also separate lanes with permutes, for the common
6440              case of power-of-two group-size odd/even extracts would work.  */
6441           if (slp_reduc && nunits != nunits1)
6442             {
6443               nunits1 = least_common_multiple (nunits1, group_size);
6444               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6445             }
6446         }
6447       if (!slp_reduc
6448           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6449         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6450
6451       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6452                                                            stype, nunits1);
6453       reduce_with_shift = have_whole_vector_shift (mode1);
6454       if (!VECTOR_MODE_P (mode1)
6455           || !directly_supported_p (code, vectype1))
6456         reduce_with_shift = false;
6457
6458       /* First reduce the vector to the desired vector size we should
6459          do shift reduction on by combining upper and lower halves.  */
6460       gimple_seq stmts = NULL;
6461       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6462                                              code, &stmts);
6463       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6464       reduc_inputs[0] = new_temp;
6465
6466       if (reduce_with_shift && !slp_reduc)
6467         {
6468           int element_bitsize = tree_to_uhwi (bitsize);
6469           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6470              for variable-length vectors and also requires direct target support
6471              for loop reductions.  */
6472           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6473           int nelements = vec_size_in_bits / element_bitsize;
6474           vec_perm_builder sel;
6475           vec_perm_indices indices;
6476
6477           int elt_offset;
6478
6479           tree zero_vec = build_zero_cst (vectype1);
6480           /* Case 2: Create:
6481              for (offset = nelements/2; offset >= 1; offset/=2)
6482                 {
6483                   Create:  va' = vec_shift <va, offset>
6484                   Create:  va = vop <va, va'>
6485                 }  */
6486
6487           tree rhs;
6488
6489           if (dump_enabled_p ())
6490             dump_printf_loc (MSG_NOTE, vect_location,
6491                              "Reduce using vector shifts\n");
6492
6493           gimple_seq stmts = NULL;
6494           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6495           for (elt_offset = nelements / 2;
6496                elt_offset >= 1;
6497                elt_offset /= 2)
6498             {
6499               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6500               indices.new_vector (sel, 2, nelements);
6501               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6502               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6503                                        new_temp, zero_vec, mask);
6504               new_temp = gimple_build (&stmts, code,
6505                                        vectype1, new_name, new_temp);
6506             }
6507           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6508
6509           /* 2.4  Extract the final scalar result.  Create:
6510              s_out3 = extract_field <v_out2, bitpos>  */
6511
6512           if (dump_enabled_p ())
6513             dump_printf_loc (MSG_NOTE, vect_location,
6514                              "extract scalar result\n");
6515
6516           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6517                         bitsize, bitsize_zero_node);
6518           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6519           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6520           gimple_assign_set_lhs (epilog_stmt, new_temp);
6521           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6522           scalar_results.safe_push (new_temp);
6523         }
6524       else
6525         {
6526           /* Case 3: Create:
6527              s = extract_field <v_out2, 0>
6528              for (offset = element_size;
6529                   offset < vector_size;
6530                   offset += element_size;)
6531                {
6532                  Create:  s' = extract_field <v_out2, offset>
6533                  Create:  s = op <s, s'>  // For non SLP cases
6534                }  */
6535
6536           if (dump_enabled_p ())
6537             dump_printf_loc (MSG_NOTE, vect_location,
6538                              "Reduce using scalar code.\n");
6539
6540           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6541           int element_bitsize = tree_to_uhwi (bitsize);
6542           tree compute_type = TREE_TYPE (vectype);
6543           gimple_seq stmts = NULL;
6544           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6545             {
6546               int bit_offset;
6547               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6548                                        vec_temp, bitsize, bitsize_zero_node);
6549
6550               /* In SLP we don't need to apply reduction operation, so we just
6551                  collect s' values in SCALAR_RESULTS.  */
6552               if (slp_reduc)
6553                 scalar_results.safe_push (new_temp);
6554
6555               for (bit_offset = element_bitsize;
6556                    bit_offset < vec_size_in_bits;
6557                    bit_offset += element_bitsize)
6558                 {
6559                   tree bitpos = bitsize_int (bit_offset);
6560                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6561                                            compute_type, vec_temp,
6562                                            bitsize, bitpos);
6563                   if (slp_reduc)
6564                     {
6565                       /* In SLP we don't need to apply reduction operation, so
6566                          we just collect s' values in SCALAR_RESULTS.  */
6567                       new_temp = new_name;
6568                       scalar_results.safe_push (new_name);
6569                     }
6570                   else
6571                     new_temp = gimple_build (&stmts, code, compute_type,
6572                                              new_name, new_temp);
6573                 }
6574             }
6575
6576           /* The only case where we need to reduce scalar results in SLP, is
6577              unrolling.  If the size of SCALAR_RESULTS is greater than
6578              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6579              REDUC_GROUP_SIZE.  */
6580           if (slp_reduc)
6581             {
6582               tree res, first_res, new_res;
6583
6584               /* Reduce multiple scalar results in case of SLP unrolling.  */
6585               for (j = group_size; scalar_results.iterate (j, &res);
6586                    j++)
6587                 {
6588                   first_res = scalar_results[j % group_size];
6589                   new_res = gimple_build (&stmts, code, compute_type,
6590                                           first_res, res);
6591                   scalar_results[j % group_size] = new_res;
6592                 }
6593               scalar_results.truncate (group_size);
6594               for (k = 0; k < group_size; k++)
6595                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6596                                                     scalar_results[k]);
6597             }
6598           else
6599             {
6600               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6601               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6602               scalar_results.safe_push (new_temp);
6603             }
6604
6605           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6606         }
6607
6608       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6609           && induc_val)
6610         {
6611           /* Earlier we set the initial value to be a vector if induc_val
6612              values.  Check the result and if it is induc_val then replace
6613              with the original initial value, unless induc_val is
6614              the same as initial_def already.  */
6615           tree zcompare = make_ssa_name (boolean_type_node);
6616           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6617                                              induc_val);
6618           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6619           tree initial_def = reduc_info->reduc_initial_values[0];
6620           tree tmp = make_ssa_name (new_scalar_dest);
6621           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6622                                              initial_def, new_temp);
6623           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6624           scalar_results[0] = tmp;
6625         }
6626     }
6627
6628   /* 2.5 Adjust the final result by the initial value of the reduction
6629          variable. (When such adjustment is not needed, then
6630          'adjustment_def' is zero).  For example, if code is PLUS we create:
6631          new_temp = loop_exit_def + adjustment_def  */
6632
6633   if (adjustment_def)
6634     {
6635       gcc_assert (!slp_reduc);
6636       gimple_seq stmts = NULL;
6637       if (double_reduc)
6638         {
6639           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6640           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6641           new_temp = gimple_build (&stmts, code, vectype,
6642                                    reduc_inputs[0], adjustment_def);
6643         }
6644       else
6645         {
6646           new_temp = scalar_results[0];
6647           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6648           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6649                                            adjustment_def);
6650           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6651           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6652                                    new_temp, adjustment_def);
6653           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6654         }
6655
6656       epilog_stmt = gimple_seq_last_stmt (stmts);
6657       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6658       scalar_results[0] = new_temp;
6659     }
6660
6661   /* Record this operation if it could be reused by the epilogue loop.  */
6662   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6663       && reduc_inputs.length () == 1)
6664     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6665                                            { orig_reduc_input, reduc_info });
6666
6667   if (double_reduc)
6668     loop = outer_loop;
6669
6670   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6671           phis with new adjusted scalar results, i.e., replace use <s_out0>
6672           with use <s_out4>.
6673
6674      Transform:
6675         loop_exit:
6676           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6677           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6678           v_out2 = reduce <v_out1>
6679           s_out3 = extract_field <v_out2, 0>
6680           s_out4 = adjust_result <s_out3>
6681           use <s_out0>
6682           use <s_out0>
6683
6684      into:
6685
6686         loop_exit:
6687           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6688           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6689           v_out2 = reduce <v_out1>
6690           s_out3 = extract_field <v_out2, 0>
6691           s_out4 = adjust_result <s_out3>
6692           use <s_out4>
6693           use <s_out4> */
6694
6695   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6696   for (k = 0; k < live_out_stmts.size (); k++)
6697     {
6698       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6699       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6700
6701       phis.create (3);
6702       /* Find the loop-closed-use at the loop exit of the original scalar
6703          result.  (The reduction result is expected to have two immediate uses,
6704          one at the latch block, and one at the loop exit).  For double
6705          reductions we are looking for exit phis of the outer loop.  */
6706       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6707         {
6708           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6709             {
6710               if (!is_gimple_debug (USE_STMT (use_p)))
6711                 phis.safe_push (USE_STMT (use_p));
6712             }
6713           else
6714             {
6715               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6716                 {
6717                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6718
6719                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6720                     {
6721                       if (!flow_bb_inside_loop_p (loop,
6722                                              gimple_bb (USE_STMT (phi_use_p)))
6723                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6724                         phis.safe_push (USE_STMT (phi_use_p));
6725                     }
6726                 }
6727             }
6728         }
6729
6730       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6731         {
6732           /* Replace the uses:  */
6733           orig_name = PHI_RESULT (exit_phi);
6734
6735           /* Look for a single use at the target of the skip edge.  */
6736           if (unify_with_main_loop_p)
6737             {
6738               use_operand_p use_p;
6739               gimple *user;
6740               if (!single_imm_use (orig_name, &use_p, &user))
6741                 gcc_unreachable ();
6742               orig_name = gimple_get_lhs (user);
6743             }
6744
6745           scalar_result = scalar_results[k];
6746           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6747             {
6748               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6749                 SET_USE (use_p, scalar_result);
6750               update_stmt (use_stmt);
6751             }
6752         }
6753
6754       phis.release ();
6755     }
6756 }
6757
6758 /* Build the vector select "MASK ? VEC : IDENTITY" as a new VEC_COND_EXPR
6759    assignment to a fresh SSA temporary of type VECTYPE, insert that
6760    assignment before GSI and return the temporary.  */
6761
6762 static tree
6763 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6764                      tree vec, tree identity)
6765 {
6766   tree merged = make_temp_ssa_name (vectype, NULL, "cond");
6767   /* Lanes where MASK is false take their value from IDENTITY.  */
6768   gsi_insert_before (gsi, gimple_build_assign (merged, VEC_COND_EXPR, mask,
6769                                                vec, identity), GSI_SAME_STMT);
6770   return merged;
6771 }
6772
6773 /* Open-code an in-order fold of VECTOR_RHS: visit its elements by
6774    increasing bit offset and combine each with the running value (initially
6775    LHS) using CODE.  All new statements go before GSI and define SSA names
6776    based on SCALAR_DEST.  Return the SSA name of the final value.  */
6777
6778 static tree
6779 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6780                        tree_code code, tree lhs, tree vector_rhs)
6781 {
6782   tree vec_type = TREE_TYPE (vector_rhs);
6783   tree elt_type = TREE_TYPE (vec_type);
6784   tree elt_size = TYPE_SIZE (elt_type);
6785   unsigned HOST_WIDE_INT total_bits = tree_to_uhwi (TYPE_SIZE (vec_type));
6786   unsigned HOST_WIDE_INT step_bits = tree_to_uhwi (elt_size);
6787   tree accum = lhs;
6788
6789   unsigned HOST_WIDE_INT pos = 0;
6790   while (pos < total_bits)
6791     {
6792       /* elt = BIT_FIELD_REF <VECTOR_RHS, elt_size, pos>;  */
6793       tree ref = build3 (BIT_FIELD_REF, elt_type, vector_rhs, elt_size,
6794                          bitsize_int (pos));
6795       gassign *extract = gimple_build_assign (scalar_dest, ref);
6796       tree elt = make_ssa_name (scalar_dest, extract);
6797       gimple_assign_set_lhs (extract, elt);
6798       gsi_insert_before (gsi, extract, GSI_SAME_STMT);
6799
6800       /* accum' = accum CODE elt;  */
6801       gassign *fold = gimple_build_assign (scalar_dest, code, accum, elt);
6802       accum = make_ssa_name (scalar_dest, fold);
6803       gimple_assign_set_lhs (fold, accum);
6804       gsi_insert_before (gsi, fold, GSI_SAME_STMT);
6805       pos += step_bits;
6806     }
6807   return accum;
6808 }
6809
6810 /* Return a predicated internal function equivalent to the in-order
6811    reduction REDUC_FN that is directly supported for VECTYPE_IN, the type
6812    of the vector input.  The mask-only form is tried before the
6813    mask-and-length form.  Return IFN_LAST if REDUC_FN has no predicated
6814    form or if neither form is supported.  */
6815
6816 static internal_fn
6817 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6818 {
6819   /* Only IFN_FOLD_LEFT_PLUS has predicated counterparts.  */
6820   if (reduc_fn != IFN_FOLD_LEFT_PLUS)
6821     return IFN_LAST;
6822
6823   /* Candidates in order of preference: mask-only, then mask-and-length.  */
6824   const internal_fn candidates[] = {
6825     IFN_MASK_FOLD_LEFT_PLUS,
6826     IFN_MASK_LEN_FOLD_LEFT_PLUS
6827   };
6828
6829   for (internal_fn candidate : candidates)
6830     {
6831       if (direct_internal_fn_supported_p (candidate, vectype_in,
6832                                           OPTIMIZE_FOR_SPEED))
6833         return candidate;
6834     }
6835
6836   return IFN_LAST;
6837 }
6838
6839 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6840    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6841    statement.  CODE is the operation performed by STMT_INFO and OPS are
6842    its scalar operands.  REDUC_INDEX is the index of the operand in
6843    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6844    implements in-order reduction, or IFN_LAST if we should open-code it.
6845    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6846    that should be used to control the operation in a fully-masked loop.
        LENS specifies the lengths that should be used in a loop that is
        fully controlled by length.  New statements are inserted at GSI.
        Without SLP (SLP_NODE is null) each generated statement is recorded
        in STMT_VINFO_VEC_STMTS of STMT_INFO and in *VEC_STMT; with SLP it
        is pushed onto SLP_NODE as a vector def.  Always returns true.  */
6847
6848 static bool
6849 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6850                                stmt_vec_info stmt_info,
6851                                gimple_stmt_iterator *gsi,
6852                                gimple **vec_stmt, slp_tree slp_node,
6853                                gimple *reduc_def_stmt,
6854                                tree_code code, internal_fn reduc_fn,
6855                                tree ops[3], tree vectype_in,
6856                                int reduc_index, vec_loop_masks *masks,
6857                                vec_loop_lens *lens)
6858 {
6859   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6860   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
       /* IFN_LAST here means that no masked form of REDUC_FN exists or is
          supported for VECTYPE_IN.  */
6861   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6862
       /* Only a single vector copy is handled (asserted below); an SLP node
          always counts as one copy.  */
6863   int ncopies;
6864   if (slp_node)
6865     ncopies = 1;
6866   else
6867     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6868
6869   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6870   gcc_assert (ncopies == 1);
6871   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6872
6873   if (slp_node)
6874     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6875                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6876
       /* CODE is binary (asserted above), so this selects the operand of OPS
          other than the one at REDUC_INDEX.  */
6877   tree op0 = ops[1 - reduc_index];
6878
6879   int group_size = 1;
6880   stmt_vec_info scalar_dest_def_info;
6881   auto_vec<tree> vec_oprnds0;
6882   if (slp_node)
6883     {
           /* Fetch the vector defs of both operands, keep a copy of those
              for operand 1 - REDUC_INDEX and then release both.  */
6884       auto_vec<vec<tree> > vec_defs (2);
6885       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6886       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6887       vec_defs[0].release ();
6888       vec_defs[1].release ();
6889       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
           /* The last scalar statement of the node supplies the scalar
              destination.  */
6890       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6891     }
6892   else
6893     {
6894       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6895                                      op0, &vec_oprnds0);
6896       scalar_dest_def_info = stmt_info;
6897     }
6898
6899   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6900   tree scalar_type = TREE_TYPE (scalar_dest);
       /* The running value of the reduction; it starts as the PHI result.  */
6901   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6902
6903   int vec_num = vec_oprnds0.length ();
6904   gcc_assert (vec_num == 1 || slp_node);
6905   tree vec_elem_type = TREE_TYPE (vectype_out);
6906   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6907
       /* In a fully-masked loop VECTOR_IDENTITY is the value given to
          inactive lanes when they are merged away below: zero, or negated
          zero if signed zeros are honored for VECTYPE_OUT.  */
6908   tree vector_identity = NULL_TREE;
6909   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6910     {
6911       vector_identity = build_zero_cst (vectype_out);
6912       if (!HONOR_SIGNED_ZEROS (vectype_out))
6913         ;
6914       else
6915         {
6916           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6917           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6918                                         vector_identity);
6919         }
6920     }
6921
6922   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6923   int i;
6924   tree def0;
       /* Emit one reduction step per vector operand, threading REDUC_VAR
          from each step to the next.  */
6925   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6926     {
6927       gimple *new_stmt;
6928       tree mask = NULL_TREE;
6929       tree len = NULL_TREE;
6930       tree bias = NULL_TREE;
6931       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6932         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6933       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6934         {
               /* Length-controlled loops pass LEN and BIAS together with a
                  minus-one (all-ones) constant of the mask type.  */
6935           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6936                                    i, 1);
6937           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6938           bias = build_int_cst (intQI_type_node, biasval);
6939           mask = build_minus_one_cst (truth_type_for (vectype_in));
6940         }
6941
6942       /* Handle MINUS by adding the negative.  */
6943       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6944         {
6945           tree negated = make_ssa_name (vectype_out);
6946           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6947           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6948           def0 = negated;
6949         }
6950
           /* Without a masked reduction function, substitute VECTOR_IDENTITY
              in the lanes that MASK disables.  */
6951       if (mask && mask_reduc_fn == IFN_LAST)
6952         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6953                                     vector_identity);
6954
6955       /* On the first iteration the input is simply the scalar phi
6956          result, and for subsequent iterations it is the output of
6957          the preceding operation.  */
6958       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6959         {
               /* Emit the call matching MASK_REDUC_FN: five operands for the
                  mask-and-length form, three for the mask-only form, and
                  otherwise the unmasked REDUC_FN with two.  */
6960           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6961             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6962                                                    def0, mask, len, bias);
6963           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6964             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6965                                                    def0, mask);
6966           else
6967             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6968                                                    def0);
6969           /* For chained SLP reductions the output of the previous reduction
6970              operation serves as the input of the next. For the final statement
6971              the output cannot be a temporary - we reuse the original
6972              scalar destination of the last statement.  */
6973           if (i != vec_num - 1)
6974             {
6975               gimple_set_lhs (new_stmt, scalar_dest_var);
6976               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6977               gimple_set_lhs (new_stmt, reduc_var);
6978             }
6979         }
6980       else
6981         {
               /* No internal function is used: open-code the reduction as a
                  chain of scalar statements.  */
6982           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6983                                              reduc_var, def0);
6984           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6985           /* Remove the statement, so that we can use the same code paths
6986              as for statements that we've just created.  */
6987           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6988           gsi_remove (&tmp_gsi, true);
6989         }
6990
           /* The last statement defines SCALAR_DEST and replaces the scalar
              statement; earlier ones are emitted as new statements at GSI.  */
6991       if (i == vec_num - 1)
6992         {
6993           gimple_set_lhs (new_stmt, scalar_dest);
6994           vect_finish_replace_stmt (loop_vinfo,
6995                                     scalar_dest_def_info,
6996                                     new_stmt);
6997         }
6998       else
6999         vect_finish_stmt_generation (loop_vinfo,
7000                                      scalar_dest_def_info,
7001                                      new_stmt, gsi);
7002
7003       if (slp_node)
7004         slp_node->push_vec_def (new_stmt);
7005       else
7006         {
7007           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7008           *vec_stmt = new_stmt;
7009         }
7010     }
7011
7012   return true;
7013 }
7014
7015 /* Function is_nonwrapping_integer_induction.
7016
7017    Return true if the induction described by STMT_VINFO, a PHI in loop
7018    LOOP, has an INTEGER_CST base and step and cannot wrap: either overflow
7019    is undefined for the type of the PHI result, or base + step * (the bound
7020    reported by max_stmt_executions for LOOP) fits in that type.  */
7021
7022 static bool
7023 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7024 {
7025   gphi *iv_phi = as_a <gphi *> (stmt_vinfo->stmt);
7026   tree init = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7027   tree incr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7028   tree iv_type = TREE_TYPE (gimple_phi_result (iv_phi));
7029
7030   /* Both the start value and the step must be integer constants.  */
7031   if (!(TREE_CODE (init) == INTEGER_CST && TREE_CODE (incr) == INTEGER_CST))
7032     return false;
7033
7034   /* With undefined overflow the induction is taken not to wrap.  */
7035   if (TYPE_OVERFLOW_UNDEFINED (iv_type))
7036     return true;
7037
7038   /* Otherwise bound the largest value reached: init + incr * niters.  */
7039   widest_int niters;
7040   if (!max_stmt_executions (loop, &niters))
7041     return false;
7042
7043   const signop sgn = TYPE_SIGN (iv_type);
7044   wi::overflow_type ovf = wi::OVF_NONE;
7045   widest_int span = wi::mul (wi::to_widest (incr), niters, sgn, &ovf);
7046   if (ovf)
7047     return false;
7048
7049   widest_int last = wi::add (wi::to_widest (init), span, sgn, &ovf);
7050   if (ovf)
7051     return false;
7052
7053   /* The bound must be representable in the type of the PHI result.  */
7054   return wi::min_precision (last, sgn) <= TYPE_PRECISION (iv_type);
7055 }
7056
7057 /* Return true if masking of the operation CODE can be supported by
7058    inserting a conditional expression.  That is the case only when
7059    COND_FN, the conditional internal function (IFN_LAST if there is
7060    none), is not directly supported for VECTYPE_IN, the type of the
7061    vector input, and CODE is DOT_PROD_EXPR or SAD_EXPR.  */
7062 static bool
7063 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7064                          tree vectype_in)
7065 {
7066   /* A supported conditional function takes precedence.  */
7067   const bool have_cond_fn
7068     = (cond_fn != IFN_LAST
7069        && direct_internal_fn_supported_p (cond_fn, vectype_in,
7070                                           OPTIMIZE_FOR_SPEED));
7071   if (have_cond_fn)
7072     return false;
7073
7074   /* Only tree codes can qualify, and of those only these two.  */
7075   if (!code.is_tree_code ())
7076     return false;
7077
7078   const tree_code tcode = tree_code (code);
7079   return tcode == DOT_PROD_EXPR || tcode == SAD_EXPR;
7080 }
7081
7082 /* Emulate masking of the operation CODE with a select.  VOP is the array
7083    of operands and MASK is the loop mask.  A VEC_COND_EXPR that keeps
7084    VOP[1] where MASK is set is inserted before GSI and its result is stored
7085    back into VOP[1].  Where MASK is clear the select yields zero for
7086    DOT_PROD_EXPR and VOP[0] for SAD_EXPR; no other code is handled.  */
7087 static void
7088 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7089                       gimple_stmt_iterator *gsi)
7090 {
7091   /* Value that replaces VOP[1] in lanes where MASK is clear.  */
7092   tree inactive_val = NULL_TREE;
7093
7094   switch (tree_code (code))
7095     {
7096     case DOT_PROD_EXPR:
7097       /* Substitute a zero vector of the type of VOP[1].  */
7098       inactive_val = build_zero_cst (TREE_TYPE (vop[1]));
7099       break;
7100
7101     case SAD_EXPR:
7102       /* Substitute the other input, VOP[0].  */
7103       inactive_val = vop[0];
7104       break;
7105
7106     default:
7107       /* Any other code reaching this point is a bug.  */
7108       gcc_unreachable ();
7109     }
7110
7111   /* masked_op1 = MASK ? VOP[1] : inactive_val;  */
7112   tree op1_type = TREE_TYPE (vop[1]);
7113   tree masked_op1 = make_temp_ssa_name (op1_type, NULL, "masked_op1");
7114   gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, mask,
7115                                          vop[1], inactive_val);
7116   gsi_insert_before (gsi, select, GSI_SAME_STMT);
7117   vop[1] = masked_op1;
7118 }
7119
7120 /* Function vectorizable_reduction.
7121
7122    Check if STMT_INFO performs a reduction operation that can be vectorized.
7123    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7124    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7125    Return true if STMT_INFO is vectorizable in this way.
7126
7127    This function also handles reduction idioms (patterns) that have been
7128    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7129    may be of this form:
7130      X = pattern_expr (arg0, arg1, ..., X)
7131    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7132    sequence that had been detected and replaced by the pattern-stmt
7133    (STMT_INFO).
7134
7135    This function also handles reduction of condition expressions, for example:
7136      for (int i = 0; i < N; i++)
7137        if (a[i] < value)
7138          last = a[i];
7139    This is handled by vectorising the loop and creating an additional vector
7140    containing the loop indexes for which "a[i] < value" was true.  In the
7141    function epilogue this is reduced to a single max value and then used to
7142    index into the vector of results.
7143
7144    In some cases of reduction patterns, the type of the reduction variable X is
7145    different than the type of the other arguments of STMT_INFO.
7146    In such cases, the vectype that is used when transforming STMT_INFO into
7147    a vector stmt is different than the vectype that is used to determine the
7148    vectorization factor, because it consists of a different number of elements
7149    than the actual number of elements that are being operated upon in parallel.
7150
7151    For example, consider an accumulation of shorts into an int accumulator.
7152    On some targets it's possible to vectorize this pattern operating on 8
7153    shorts at a time (hence, the vectype for purposes of determining the
7154    vectorization factor should be V8HI); on the other hand, the vectype that
7155    is used to create the vector form is actually V4SI (the type of the result).
7156
7157    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7158    indicates what is the actual level of parallelism (V8HI in the example), so
7159    that the right vectorization factor would be derived.  This vectype
7160    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7161    be used to create the vectorized stmt.  The right vectype for the vectorized
7162    stmt is obtained from the type of the result X:
7163       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7164
7165    This means that, contrary to "regular" reductions (or "regular" stmts in
7166    general), the following equation:
7167       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7168    does *NOT* necessarily hold for reduction patterns.  */
7169
7170 bool
7171 vectorizable_reduction (loop_vec_info loop_vinfo,
7172                         stmt_vec_info stmt_info, slp_tree slp_node,
7173                         slp_instance slp_node_instance,
7174                         stmt_vector_for_cost *cost_vec)
7175 {
7176   tree vectype_in = NULL_TREE;
7177   tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7178   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7179   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7180   stmt_vec_info cond_stmt_vinfo = NULL;
7181   int i;
7182   int ncopies;
7183   bool single_defuse_cycle = false;
7184   bool nested_cycle = false;
7185   bool double_reduc = false;
7186   int vec_num;
7187   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7188   tree cond_reduc_val = NULL_TREE;
7189
7190   /* Make sure it was already recognized as a reduction computation.  */
7191   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7192       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7193       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7194     return false;
7195
7196   /* The stmt we store reduction analysis meta on.  */
7197   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7198   reduc_info->is_reduc_info = true;
7199
7200   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7201     {
7202       if (is_a <gphi *> (stmt_info->stmt))
7203         {
7204           if (slp_node)
7205             {
7206               /* We eventually need to set a vector type on invariant
7207                  arguments.  */
7208               unsigned j;
7209               slp_tree child;
7210               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7211                 if (!vect_maybe_update_slp_op_vectype
7212                        (child, SLP_TREE_VECTYPE (slp_node)))
7213                   {
7214                     if (dump_enabled_p ())
7215                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7216                                        "incompatible vector types for "
7217                                        "invariants\n");
7218                     return false;
7219                   }
7220             }
7221           /* Analysis for double-reduction is done on the outer
7222              loop PHI, nested cycles have no further restrictions.  */
7223           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7224         }
7225       else
7226         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7227       return true;
7228     }
7229
7230   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7231   stmt_vec_info phi_info = stmt_info;
7232   if (!is_a <gphi *> (stmt_info->stmt))
7233     {
7234       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7235       return true;
7236     }
7237   if (slp_node)
7238     {
7239       slp_node_instance->reduc_phis = slp_node;
7240       /* ???  We're leaving slp_node to point to the PHIs, we only
7241          need it to get at the number of vector stmts which wasn't
7242          yet initialized for the instance root.  */
7243     }
7244   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7245     {
7246       use_operand_p use_p;
7247       gimple *use_stmt;
7248       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7249                                  &use_p, &use_stmt);
7250       gcc_assert (res);
7251       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7252     }
7253
7254   /* PHIs should not participate in patterns.  */
7255   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7256   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7257
7258   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7259      and compute the reduction chain length.  Discover the real
7260      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7261   tree reduc_def
7262     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7263                              loop_latch_edge
7264                                (gimple_bb (reduc_def_phi)->loop_father));
7265   unsigned reduc_chain_length = 0;
7266   bool only_slp_reduc_chain = true;
7267   stmt_info = NULL;
7268   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7269   while (reduc_def != PHI_RESULT (reduc_def_phi))
7270     {
7271       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7272       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7273       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7274         {
7275           if (dump_enabled_p ())
7276             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7277                              "reduction chain broken by patterns.\n");
7278           return false;
7279         }
7280       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7281         only_slp_reduc_chain = false;
7282       /* For epilogue generation live members of the chain need
7283          to point back to the PHI via their original stmt for
7284          info_for_reduction to work.  For SLP we need to look at
7285          all lanes here - even though we only will vectorize from
7286          the SLP node with live lane zero the other live lanes also
7287          need to be identified as part of a reduction to be able
7288          to skip code generation for them.  */
7289       if (slp_for_stmt_info)
7290         {
7291           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7292             if (STMT_VINFO_LIVE_P (s))
7293               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7294         }
7295       else if (STMT_VINFO_LIVE_P (vdef))
7296         STMT_VINFO_REDUC_DEF (def) = phi_info;
7297       gimple_match_op op;
7298       if (!gimple_extract_op (vdef->stmt, &op))
7299         {
7300           if (dump_enabled_p ())
7301             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7302                              "reduction chain includes unsupported"
7303                              " statement type.\n");
7304           return false;
7305         }
7306       if (CONVERT_EXPR_CODE_P (op.code))
7307         {
7308           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7309             {
7310               if (dump_enabled_p ())
7311                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312                                  "conversion in the reduction chain.\n");
7313               return false;
7314             }
7315         }
7316       else if (!stmt_info)
7317         /* First non-conversion stmt.  */
7318         stmt_info = vdef;
7319       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7320       reduc_chain_length++;
7321       if (!stmt_info && slp_node)
7322         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7323     }
7324   /* PHIs should not participate in patterns.  */
7325   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7326
7327   if (nested_in_vect_loop_p (loop, stmt_info))
7328     {
7329       loop = loop->inner;
7330       nested_cycle = true;
7331     }
7332
7333   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7334      element.  */
7335   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7336     {
7337       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7338       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7339     }
7340   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7341     gcc_assert (slp_node
7342                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7343
7344   /* 1. Is vectorizable reduction?  */
7345   /* Not supportable if the reduction variable is used in the loop, unless
7346      it's a reduction chain.  */
7347   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7348       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7349     return false;
7350
7351   /* Reductions that are not used even in an enclosing outer-loop,
7352      are expected to be "live" (used out of the loop).  */
7353   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7354       && !STMT_VINFO_LIVE_P (stmt_info))
7355     return false;
7356
7357   /* 2. Has this been recognized as a reduction pattern?
7358
7359      Check if STMT represents a pattern that has been recognized
7360      in earlier analysis stages.  For stmts that represent a pattern,
7361      the STMT_VINFO_RELATED_STMT field records the last stmt in
7362      the original sequence that constitutes the pattern.  */
7363
7364   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7365   if (orig_stmt_info)
7366     {
7367       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7368       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7369     }
7370
7371   /* 3. Check the operands of the operation.  The first operands are defined
7372         inside the loop body. The last operand is the reduction variable,
7373         which is defined by the loop-header-phi.  */
7374
7375   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7376   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7377   gimple_match_op op;
7378   if (!gimple_extract_op (stmt_info->stmt, &op))
7379     gcc_unreachable ();
7380   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7381                             || op.code == WIDEN_SUM_EXPR
7382                             || op.code == SAD_EXPR);
7383
7384   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7385       && !SCALAR_FLOAT_TYPE_P (op.type))
7386     return false;
7387
7388   /* Do not try to vectorize bit-precision reductions.  */
7389   if (!type_has_mode_precision_p (op.type))
7390     return false;
7391
7392   /* For lane-reducing ops we're reducing the number of reduction PHIs
7393      which means the only use of that may be in the lane-reducing operation.  */
7394   if (lane_reduc_code_p
7395       && reduc_chain_length != 1
7396       && !only_slp_reduc_chain)
7397     {
7398       if (dump_enabled_p ())
7399         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7400                          "lane-reducing reduction with extra stmts.\n");
7401       return false;
7402     }
7403
7404   /* All uses but the last are expected to be defined in the loop.
7405      The last use is the reduction variable.  In case of nested cycle this
7406      assumption is not true: we use reduc_index to record the index of the
7407      reduction variable.  */
7408   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7409   /* We need to skip an extra operand for COND_EXPRs with embedded
7410      comparison.  */
7411   unsigned opno_adjust = 0;
7412   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7413     opno_adjust = 1;
7414   for (i = 0; i < (int) op.num_ops; i++)
7415     {
7416       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7417       if (i == 0 && op.code == COND_EXPR)
7418         continue;
7419
7420       stmt_vec_info def_stmt_info;
7421       enum vect_def_type dt;
7422       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7423                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7424                                &vectype_op[i], &def_stmt_info))
7425         {
7426           if (dump_enabled_p ())
7427             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7428                              "use not simple.\n");
7429           return false;
7430         }
7431       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7432         continue;
7433
7434       /* There should be only one cycle def in the stmt, the one
7435          leading to reduc_def.  */
7436       if (VECTORIZABLE_CYCLE_DEF (dt))
7437         return false;
7438
7439       if (!vectype_op[i])
7440         vectype_op[i]
7441           = get_vectype_for_scalar_type (loop_vinfo,
7442                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7443
7444       /* To properly compute ncopies we are interested in the widest
7445          non-reduction input type in case we're looking at a widening
7446          accumulation that we later handle in vect_transform_reduction.  */
7447       if (lane_reduc_code_p
7448           && vectype_op[i]
7449           && (!vectype_in
7450               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7451                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7452         vectype_in = vectype_op[i];
7453
7454       if (op.code == COND_EXPR)
7455         {
7456           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
7457           if (dt == vect_constant_def)
7458             {
7459               cond_reduc_dt = dt;
7460               cond_reduc_val = op.ops[i];
7461             }
7462           if (dt == vect_induction_def
7463               && def_stmt_info
7464               && is_nonwrapping_integer_induction (def_stmt_info, loop))
7465             {
7466               cond_reduc_dt = dt;
7467               cond_stmt_vinfo = def_stmt_info;
7468             }
7469         }
7470     }
7471   if (!vectype_in)
7472     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7473   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7474
7475   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7476   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7477   /* If we have a condition reduction, see if we can simplify it further.  */
7478   if (v_reduc_type == COND_REDUCTION)
7479     {
7480       if (slp_node)
7481         return false;
7482
7483       /* When the condition uses the reduction value in the condition, fail.  */
7484       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7485         {
7486           if (dump_enabled_p ())
7487             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7488                              "condition depends on previous iteration\n");
7489           return false;
7490         }
7491
7492       if (reduc_chain_length == 1
7493           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7494                                               OPTIMIZE_FOR_SPEED)
7495               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7496                                                  vectype_in,
7497                                                  OPTIMIZE_FOR_SPEED)))
7498         {
7499           if (dump_enabled_p ())
7500             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7501                              "optimizing condition reduction with"
7502                              " FOLD_EXTRACT_LAST.\n");
7503           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7504         }
7505       else if (cond_reduc_dt == vect_induction_def)
7506         {
7507           tree base
7508             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7509           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7510
7511           gcc_assert (TREE_CODE (base) == INTEGER_CST
7512                       && TREE_CODE (step) == INTEGER_CST);
7513           cond_reduc_val = NULL_TREE;
7514           enum tree_code cond_reduc_op_code = ERROR_MARK;
7515           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7516           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7517             ;
7518           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7519              above base; punt if base is the minimum value of the type for
7520              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7521           else if (tree_int_cst_sgn (step) == -1)
7522             {
7523               cond_reduc_op_code = MIN_EXPR;
7524               if (tree_int_cst_sgn (base) == -1)
7525                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7526               else if (tree_int_cst_lt (base,
7527                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7528                 cond_reduc_val
7529                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7530             }
7531           else
7532             {
7533               cond_reduc_op_code = MAX_EXPR;
7534               if (tree_int_cst_sgn (base) == 1)
7535                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7536               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7537                                         base))
7538                 cond_reduc_val
7539                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7540             }
7541           if (cond_reduc_val)
7542             {
7543               if (dump_enabled_p ())
7544                 dump_printf_loc (MSG_NOTE, vect_location,
7545                                  "condition expression based on "
7546                                  "integer induction.\n");
7547               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7548               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7549                 = cond_reduc_val;
7550               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7551             }
7552         }
7553       else if (cond_reduc_dt == vect_constant_def)
7554         {
7555           enum vect_def_type cond_initial_dt;
7556           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7557           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7558           if (cond_initial_dt == vect_constant_def
7559               && types_compatible_p (TREE_TYPE (cond_initial_val),
7560                                      TREE_TYPE (cond_reduc_val)))
7561             {
7562               tree e = fold_binary (LE_EXPR, boolean_type_node,
7563                                     cond_initial_val, cond_reduc_val);
7564               if (e && (integer_onep (e) || integer_zerop (e)))
7565                 {
7566                   if (dump_enabled_p ())
7567                     dump_printf_loc (MSG_NOTE, vect_location,
7568                                      "condition expression based on "
7569                                      "compile time constant.\n");
7570                   /* Record reduction code at analysis stage.  */
7571                   STMT_VINFO_REDUC_CODE (reduc_info)
7572                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7573                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7574                 }
7575             }
7576         }
7577     }
7578
7579   if (STMT_VINFO_LIVE_P (phi_info))
7580     return false;
7581
7582   if (slp_node)
7583     ncopies = 1;
7584   else
7585     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7586
7587   gcc_assert (ncopies >= 1);
7588
7589   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7590
7591   if (nested_cycle)
7592     {
7593       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7594                   == vect_double_reduction_def);
7595       double_reduc = true;
7596     }
7597
7598   /* 4.2. Check support for the epilog operation.
7599
7600           If STMT represents a reduction pattern, then the type of the
7601           reduction variable may be different than the type of the rest
7602           of the arguments.  For example, consider the case of accumulation
7603           of shorts into an int accumulator; The original code:
7604                         S1: int_a = (int) short_a;
7605           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7606
7607           was replaced with:
7608                         STMT: int_acc = widen_sum <short_a, int_acc>
7609
7610           This means that:
7611           1. The tree-code that is used to create the vector operation in the
7612              epilog code (that reduces the partial results) is not the
7613              tree-code of STMT, but is rather the tree-code of the original
7614              stmt from the pattern that STMT is replacing.  I.e., in the example
7615              above we want to use 'widen_sum' in the loop, but 'plus' in the
7616              epilog.
7617           2. The type (mode) we use to check available target support
7618              for the vector operation to be created in the *epilog*, is
7619              determined by the type of the reduction variable (in the example
7620              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7621              However the type (mode) we use to check available target support
7622              for the vector operation to be created *inside the loop*, is
7623              determined by the type of the other arguments to STMT (in the
7624              example we'd check this: optab_handler (widen_sum_optab,
7625              vect_short_mode)).
7626
7627           This is contrary to "regular" reductions, in which the types of all
7628           the arguments are the same as the type of the reduction variable.
7629           For "regular" reductions we can therefore use the same vector type
7630           (and also the same tree-code) when generating the epilog code and
7631           when generating the code inside the loop.  */
7632
7633   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7634   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7635
7636   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7637   if (reduction_type == TREE_CODE_REDUCTION)
7638     {
7639       /* Check whether it's ok to change the order of the computation.
7640          Generally, when vectorizing a reduction we change the order of the
7641          computation.  This may change the behavior of the program in some
7642          cases, so we need to check that this is ok.  One exception is when
7643          vectorizing an outer-loop: the inner-loop is executed sequentially,
7644          and therefore vectorizing reductions in the inner-loop during
7645          outer-loop vectorization is safe.  Likewise when we are vectorizing
7646          a series of reductions using SLP and the VF is one the reductions
7647          are performed in scalar order.  */
7648       if (slp_node
7649           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7650           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7651         ;
7652       else if (needs_fold_left_reduction_p (op.type, orig_code))
7653         {
7654           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7655              is not directly used in stmt.  */
7656           if (!only_slp_reduc_chain
7657               && reduc_chain_length != 1)
7658             {
7659               if (dump_enabled_p ())
7660                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7661                                  "in-order reduction chain without SLP.\n");
7662               return false;
7663             }
7664           STMT_VINFO_REDUC_TYPE (reduc_info)
7665             = reduction_type = FOLD_LEFT_REDUCTION;
7666         }
7667       else if (!commutative_binary_op_p (orig_code, op.type)
7668                || !associative_binary_op_p (orig_code, op.type))
7669         {
7670           if (dump_enabled_p ())
7671             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7672                             "reduction: not commutative/associative");
7673           return false;
7674         }
7675     }
7676
7677   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7678       && ncopies > 1)
7679     {
7680       if (dump_enabled_p ())
7681         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7682                          "multiple types in double reduction or condition "
7683                          "reduction or fold-left reduction.\n");
7684       return false;
7685     }
7686
7687   internal_fn reduc_fn = IFN_LAST;
7688   if (reduction_type == TREE_CODE_REDUCTION
7689       || reduction_type == FOLD_LEFT_REDUCTION
7690       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7691       || reduction_type == CONST_COND_REDUCTION)
7692     {
7693       if (reduction_type == FOLD_LEFT_REDUCTION
7694           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7695           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7696         {
7697           if (reduc_fn != IFN_LAST
7698               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7699                                                   OPTIMIZE_FOR_SPEED))
7700             {
7701               if (dump_enabled_p ())
7702                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7703                                  "reduc op not supported by target.\n");
7704
7705               reduc_fn = IFN_LAST;
7706             }
7707         }
7708       else
7709         {
7710           if (!nested_cycle || double_reduc)
7711             {
7712               if (dump_enabled_p ())
7713                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7714                                  "no reduc code for scalar code.\n");
7715
7716               return false;
7717             }
7718         }
7719     }
7720   else if (reduction_type == COND_REDUCTION)
7721     {
7722       int scalar_precision
7723         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7724       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7725       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7726                                                 vectype_out);
7727
7728       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7729                                           OPTIMIZE_FOR_SPEED))
7730         reduc_fn = IFN_REDUC_MAX;
7731     }
7732   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7733
7734   if (reduction_type != EXTRACT_LAST_REDUCTION
7735       && (!nested_cycle || double_reduc)
7736       && reduc_fn == IFN_LAST
7737       && !nunits_out.is_constant ())
7738     {
7739       if (dump_enabled_p ())
7740         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7741                          "missing target support for reduction on"
7742                          " variable-length vectors.\n");
7743       return false;
7744     }
7745
7746   /* For SLP reductions, see if there is a neutral value we can use.  */
7747   tree neutral_op = NULL_TREE;
7748   if (slp_node)
7749     {
7750       tree initial_value = NULL_TREE;
7751       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7752         initial_value = vect_phi_initial_value (reduc_def_phi);
7753       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7754                                              orig_code, initial_value);
7755     }
7756
7757   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7758     {
7759       /* We can't support in-order reductions of code such as this:
7760
7761            for (int i = 0; i < n1; ++i)
7762              for (int j = 0; j < n2; ++j)
7763                l += a[j];
7764
7765          since GCC effectively transforms the loop when vectorizing:
7766
7767            for (int i = 0; i < n1 / VF; ++i)
7768              for (int j = 0; j < n2; ++j)
7769                for (int k = 0; k < VF; ++k)
7770                  l += a[j];
7771
7772          which is a reassociation of the original operation.  */
7773       if (dump_enabled_p ())
7774         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7775                          "in-order double reduction not supported.\n");
7776
7777       return false;
7778     }
7779
7780   if (reduction_type == FOLD_LEFT_REDUCTION
7781       && slp_node
7782       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7783     {
7784       /* We cannot use in-order reductions in this case because there is
7785          an implicit reassociation of the operations involved.  */
7786       if (dump_enabled_p ())
7787         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7788                          "in-order unchained SLP reductions not supported.\n");
7789       return false;
7790     }
7791
7792   /* For double reductions, and for SLP reductions with a neutral value,
7793      we construct a variable-length initial vector by loading a vector
7794      full of the neutral value and then shift-and-inserting the start
7795      values into the low-numbered elements.  */
7796   if ((double_reduc || neutral_op)
7797       && !nunits_out.is_constant ()
7798       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7799                                           vectype_out, OPTIMIZE_FOR_SPEED))
7800     {
7801       if (dump_enabled_p ())
7802         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7803                          "reduction on variable-length vectors requires"
7804                          " target support for a vector-shift-and-insert"
7805                          " operation.\n");
7806       return false;
7807     }
7808
7809   /* Check extra constraints for variable-length unchained SLP reductions.  */
7810   if (slp_node
7811       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7812       && !nunits_out.is_constant ())
7813     {
7814       /* We checked above that we could build the initial vector when
7815          there's a neutral element value.  Check here for the case in
7816          which each SLP statement has its own initial value and in which
7817          that value needs to be repeated for every instance of the
7818          statement within the initial vector.  */
7819       unsigned int group_size = SLP_TREE_LANES (slp_node);
7820       if (!neutral_op
7821           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7822                                               TREE_TYPE (vectype_out)))
7823         {
7824           if (dump_enabled_p ())
7825             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7826                              "unsupported form of SLP reduction for"
7827                              " variable-length vectors: cannot build"
7828                              " initial vector.\n");
7829           return false;
7830         }
7831       /* The epilogue code relies on the number of elements being a multiple
7832          of the group size.  The duplicate-and-interleave approach to setting
7833          up the initial vector does too.  */
7834       if (!multiple_p (nunits_out, group_size))
7835         {
7836           if (dump_enabled_p ())
7837             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7838                              "unsupported form of SLP reduction for"
7839                              " variable-length vectors: the vector size"
7840                              " is not a multiple of the number of results.\n");
7841           return false;
7842         }
7843     }
7844
7845   if (reduction_type == COND_REDUCTION)
7846     {
7847       widest_int ni;
7848
7849       if (! max_loop_iterations (loop, &ni))
7850         {
7851           if (dump_enabled_p ())
7852             dump_printf_loc (MSG_NOTE, vect_location,
7853                              "loop count not known, cannot create cond "
7854                              "reduction.\n");
7855           return false;
7856         }
7857       /* Convert backedges to iterations.  */
7858       ni += 1;
7859
7860       /* The additional index will be the same type as the condition.  Check
7861          that the loop can fit into this less one (because we'll use up the
7862          zero slot for when there are no matches).  */
7863       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7864       if (wi::geu_p (ni, wi::to_widest (max_index)))
7865         {
7866           if (dump_enabled_p ())
7867             dump_printf_loc (MSG_NOTE, vect_location,
7868                              "loop size is greater than data size.\n");
7869           return false;
7870         }
7871     }
7872
7873   /* In case the vectorization factor (VF) is bigger than the number
7874      of elements that we can fit in a vectype (nunits), we have to generate
7875      more than one vector stmt - i.e - we need to "unroll" the
7876      vector stmt by a factor VF/nunits.  For more details see documentation
7877      in vectorizable_operation.  */
7878
7879   /* If the reduction is used in an outer loop we need to generate
7880      VF intermediate results, like so (e.g. for ncopies=2):
7881         r0 = phi (init, r0)
7882         r1 = phi (init, r1)
7883         r0 = x0 + r0;
7884         r1 = x1 + r1;
7885     (i.e. we generate VF results in 2 registers).
7886     In this case we have a separate def-use cycle for each copy, and therefore
7887     for each copy we get the vector def for the reduction variable from the
7888     respective phi node created for this copy.
7889
7890     Otherwise (the reduction is unused in the loop nest), we can combine
7891     together intermediate results, like so (e.g. for ncopies=2):
7892         r = phi (init, r)
7893         r = x0 + r;
7894         r = x1 + r;
7895    (i.e. we generate VF/2 results in a single register).
7896    In this case for each copy we get the vector def for the reduction variable
7897    from the vectorized reduction operation generated in the previous iteration.
7898
7899    This only works when we see both the reduction PHI and its only consumer
7900    in vectorizable_reduction and there are no intermediate stmts
7901    participating.  When unrolling we want each unrolled iteration to have its
7902    own reduction accumulator since one of the main goals of unrolling a
7903    reduction is to reduce the aggregate loop-carried latency.  */
7904   if (ncopies > 1
7905       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7906       && reduc_chain_length == 1
7907       && loop_vinfo->suggested_unroll_factor == 1)
7908     single_defuse_cycle = true;
7909
7910   if (single_defuse_cycle || lane_reduc_code_p)
7911     {
7912       gcc_assert (op.code != COND_EXPR);
7913
7914       /* 4. Supportable by target?  */
7915       bool ok = true;
7916
7917       /* 4.1. check support for the operation in the loop
7918
7919          This isn't necessary for the lane reduction codes, since they
7920          can only be produced by pattern matching, and it's up to the
7921          pattern matcher to test for support.  The main reason for
7922          specifically skipping this step is to avoid rechecking whether
7923          mixed-sign dot-products can be implemented using signed
7924          dot-products.  */
7925       machine_mode vec_mode = TYPE_MODE (vectype_in);
7926       if (!lane_reduc_code_p
7927           && !directly_supported_p (op.code, vectype_in, optab_vector))
7928         {
7929           if (dump_enabled_p ())
7930             dump_printf (MSG_NOTE, "op not supported by target.\n");
7931           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7932               || !vect_can_vectorize_without_simd_p (op.code))
7933             ok = false;
7934           else
7935             if (dump_enabled_p ())
7936               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7937         }
7938
7939       if (vect_emulated_vector_p (vectype_in)
7940           && !vect_can_vectorize_without_simd_p (op.code))
7941         {
7942           if (dump_enabled_p ())
7943             dump_printf (MSG_NOTE, "using word mode not possible.\n");
7944           return false;
7945         }
7946
7947       /* lane-reducing operations have to go through vect_transform_reduction.
7948          For the other cases try without the single cycle optimization.  */
7949       if (!ok)
7950         {
7951           if (lane_reduc_code_p)
7952             return false;
7953           else
7954             single_defuse_cycle = false;
7955         }
7956     }
7957   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7958
7959   /* If the reduction stmt is one of the patterns that have lane
7960      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7961   if ((ncopies > 1 && ! single_defuse_cycle)
7962       && lane_reduc_code_p)
7963     {
7964       if (dump_enabled_p ())
7965         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966                          "multi def-use cycle not possible for lane-reducing "
7967                          "reduction operation\n");
7968       return false;
7969     }
7970
7971   if (slp_node
7972       && !(!single_defuse_cycle
7973            && !lane_reduc_code_p
7974            && reduction_type != FOLD_LEFT_REDUCTION))
7975     for (i = 0; i < (int) op.num_ops; i++)
7976       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7977         {
7978           if (dump_enabled_p ())
7979             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7980                              "incompatible vector types for invariants\n");
7981           return false;
7982         }
7983
7984   if (slp_node)
7985     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7986   else
7987     vec_num = 1;
7988
7989   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7990                              reduction_type, ncopies, cost_vec);
7991   /* Cost the reduction op inside the loop if transformed via
7992      vect_transform_reduction.  Otherwise this is costed by the
7993      separate vectorizable_* routines.  */
7994   if (single_defuse_cycle || lane_reduc_code_p)
7995     {
7996       int factor = 1;
7997       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7998         /* Three dot-products and a subtraction.  */
7999         factor = 4;
8000       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8001                         stmt_info, 0, vect_body);
8002     }
8003
8004   if (dump_enabled_p ()
8005       && reduction_type == FOLD_LEFT_REDUCTION)
8006     dump_printf_loc (MSG_NOTE, vect_location,
8007                      "using an in-order (fold-left) reduction.\n");
8008   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8009   /* All but single defuse-cycle optimized, lane-reducing and fold-left
8010      reductions go through their own vectorizable_* routines.  */
8011   if (!single_defuse_cycle
8012       && !lane_reduc_code_p
8013       && reduction_type != FOLD_LEFT_REDUCTION)
8014     {
8015       stmt_vec_info tem
8016         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8017       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8018         {
8019           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8020           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8021         }
8022       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8023       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8024     }
8025   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8026     {
8027       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8028       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8029       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8030
8031       if (reduction_type != FOLD_LEFT_REDUCTION
8032           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8033           && (cond_fn == IFN_LAST
8034               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8035                                                   OPTIMIZE_FOR_SPEED)))
8036         {
8037           if (dump_enabled_p ())
8038             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8039                              "can't operate on partial vectors because"
8040                              " no conditional operation is available.\n");
8041           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8042         }
8043       else if (reduction_type == FOLD_LEFT_REDUCTION
8044                && reduc_fn == IFN_LAST
8045                && !expand_vec_cond_expr_p (vectype_in,
8046                                            truth_type_for (vectype_in),
8047                                            SSA_NAME))
8048         {
8049           if (dump_enabled_p ())
8050             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8051                              "can't operate on partial vectors because"
8052                              " no conditional operation is available.\n");
8053           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8054         }
8055       else if (reduction_type == FOLD_LEFT_REDUCTION
8056                && reduc_fn == IFN_LAST
8057                && FLOAT_TYPE_P (vectype_in)
8058                && HONOR_SIGNED_ZEROS (vectype_in)
8059                && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8060         {
8061           if (dump_enabled_p ())
8062             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8063                              "can't operate on partial vectors because"
8064                              " signed zeros cannot be preserved.\n");
8065           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8066         }
8067       else
8068         {
8069           internal_fn mask_reduc_fn
8070             = get_masked_reduction_fn (reduc_fn, vectype_in);
8071
8072           if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8073             vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8074                                   vectype_in, 1);
8075           else
8076             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8077                                    vectype_in, NULL);
8078         }
8079     }
8080   return true;
8081 }
8082
8083 /* STMT_INFO is a dot-product reduction whose multiplication operands
8084    have different signs.  Emit a sequence to emulate the operation
8085    using a series of signed DOT_PROD_EXPRs and return the last
8086    statement generated.  VEC_DEST is the result of the vector operation
8087    and VOP lists its inputs.

        The intermediate statements are inserted at GSI through
        vect_finish_stmt_generation.  The returned statement, a conversion
        of the last stage with VEC_DEST as its lhs, is only built here;
        inserting it is left to the caller.

        VOP is updated in place: VOP[0] and VOP[1] may be swapped and every
        unsigned entry is replaced by the result of a NOP_EXPR to the
        corresponding signed type.  */
8088
8089 static gassign *
8090 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8091                              gimple_stmt_iterator *gsi, tree vec_dest,
8092                              tree vop[3])
8093 {
       /* Signed variants of the result type and of the multiplication
          input type; the whole emulation is carried out in these.  */
8094   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8095   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8096   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8097   gimple *new_stmt;
8098
8099   /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
8100   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8101     std::swap (vop[0], vop[1]);
8102
8103   /* Convert all inputs to signed types.  This covers the accumulator
          VOP[2] as well as the two multiplication operands.  */
8104   for (int i = 0; i < 3; ++i)
8105     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8106       {
8107         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8108         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8109         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8110         vop[i] = tmp;
8111       }
8112
8113   /* In the comments below we assume 8-bit inputs for simplicity,
8114      but the approach works for any full integer type.  */
8115
8116   /* Create a vector of -128.  */
8117   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8118   tree min_narrow = build_vector_from_val (narrow_vectype,
8119                                            min_narrow_elttype);
8120
8121   /* Create a vector of 64.  This is the bit pattern of -128 shifted
          right logically by one position.  */
8122   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8123   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8124   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8125
8126   /* Emit: SUB_RES = VOP[0] - 128.  The subtraction is expressed as an
          addition of the -128 vector built above.  */
8127   tree sub_res = make_ssa_name (narrow_vectype);
8128   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8129   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8130
8131   /* Emit:
8132
8133        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8134        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8135        STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8136
8137      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y,
          where x is the originally-unsigned VOP[0] and y is VOP[1].
8138      Doing the two 64 * y steps first allows more time to compute x.  */
8139   tree stage1 = make_ssa_name (wide_vectype);
8140   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8141                                   vop[1], half_narrow, vop[2]);
8142   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8143
8144   tree stage2 = make_ssa_name (wide_vectype);
8145   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8146                                   vop[1], half_narrow, stage1);
8147   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8148
8149   tree stage3 = make_ssa_name (wide_vectype);
8150   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8151                                   sub_res, vop[1], stage2);
8152   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8153
8154   /* Convert STAGE3 to the reduction type.  The statement is returned
          without being inserted.  */
8155   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8156 }
8157
8158 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8159    value.

        New vector statements are emitted at GSI.  With SLP_NODE every
        generated statement is recorded on the node.  Otherwise the
        statements are pushed to STMT_VINFO_VEC_STMTS (STMT_INFO), except
        for the intermediate copies of a single def-use cycle which are
        only chained into the next copy, and *VEC_STMT is set to the
        first recorded statement.

        Fold-left reductions are handed over to
        vectorize_fold_left_reduction and its result is returned; every
        other path returns true.  */
8160
8161 bool
8162 vect_transform_reduction (loop_vec_info loop_vinfo,
8163                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8164                           gimple **vec_stmt, slp_tree slp_node)
8165 {
8166   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8167   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8168   int i;
8169   int ncopies;
8170   int vec_num;
8171
8172   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8173   gcc_assert (reduc_info->is_reduc_info);
8174
8175   if (nested_in_vect_loop_p (loop, stmt_info))
8176     {
8177       loop = loop->inner;
8178       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8179     }
8180
       /* STMT_INFO is expected to always be decomposable into a code and
          operands at this point.  */
8181   gimple_match_op op;
8182   if (!gimple_extract_op (stmt_info->stmt, &op))
8183     gcc_unreachable ();
8184
8185   /* All uses but the last are expected to be defined in the loop.
8186      The last use is the reduction variable.  In case of nested cycle this
8187      assumption is not true: we use reduc_index to record the index of the
8188      reduction variable.  */
8189   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8190   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8191   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8192   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8193
       /* With SLP the number of vectors comes from the node and there is
          a single copy; without SLP it is the other way around.  */
8194   if (slp_node)
8195     {
8196       ncopies = 1;
8197       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8198     }
8199   else
8200     {
8201       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8202       vec_num = 1;
8203     }
8204
8205   code_helper code = canonicalize_code (op.code, op.type);
8206   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8207   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8208   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8209   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8210
8211   /* Transform.  */
8212   tree new_temp = NULL_TREE;
8213   auto_vec<tree> vec_oprnds0;
8214   auto_vec<tree> vec_oprnds1;
8215   auto_vec<tree> vec_oprnds2;
8216   tree def0;
8217
8218   if (dump_enabled_p ())
8219     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8220
8221   /* FORNOW: Multiple types are not supported for condition.  */
8222   if (code == COND_EXPR)
8223     gcc_assert (ncopies == 1);
8224
8225   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8226
       /* In-order reductions have their own code generation routine;
          nothing below applies to them.  */
8227   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8228   if (reduction_type == FOLD_LEFT_REDUCTION)
8229     {
8230       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8231       gcc_assert (code.is_tree_code ());
8232       return vectorize_fold_left_reduction
8233           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8234            tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
8235            lens);
8236     }
8237
       /* Only single def-use cycle reductions and the lane-reducing codes
          listed here are expected to reach this point.  */
8238   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8239   gcc_assert (single_defuse_cycle
8240               || code == DOT_PROD_EXPR
8241               || code == WIDEN_SUM_EXPR
8242               || code == SAD_EXPR);
8243
8244   /* Create the destination vector  */
8245   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8246   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8247
       /* Collect the vector defs of the operands.  For a single def-use
          cycle the reduction operand is passed as NULL_TREE here and is
          fetched separately below with a single copy; the remaining
          copies are filled in by the loop further down.  */
8248   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8249                      single_defuse_cycle && reduc_index == 0
8250                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
8251                      single_defuse_cycle && reduc_index == 1
8252                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
8253                      op.num_ops == 3
8254                      && !(single_defuse_cycle && reduc_index == 2)
8255                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8256   if (single_defuse_cycle)
8257     {
8258       gcc_assert (!slp_node);
8259       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8260                                      op.ops[reduc_index],
8261                                      reduc_index == 0 ? &vec_oprnds0
8262                                      : (reduc_index == 1 ? &vec_oprnds1
8263                                         : &vec_oprnds2));
8264     }
8265
8266   bool emulated_mixed_dot_prod
8267     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
       /* Emit one vector statement per element of VEC_OPRNDS0.  Note that
          for a single def-use cycle the loop body appends to the vector
          holding the reduction operand, which may be VEC_OPRNDS0 itself.  */
8268   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8269     {
8270       gimple *new_stmt;
8271       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8272       if (masked_loop_p && !mask_by_cond_expr)
8273         {
8274           /* No conditional ifns have been defined for dot-product yet.  */
8275           gcc_assert (code != DOT_PROD_EXPR);
8276
8277           /* Make sure that the reduction accumulator is vop[0].  */
8278           if (reduc_index == 1)
8279             {
8280               gcc_assert (commutative_binary_op_p (code, op.type));
8281               std::swap (vop[0], vop[1]);
8282             }
8283           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8284                                           vec_num * ncopies, vectype_in, i);
               /* Build COND_FN (MASK, VOP[0], VOP[1], VOP[0]); the
                  accumulator is passed both as the first operand and as
                  the trailing argument.  */
8285           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8286                                                     vop[0], vop[1], vop[0]);
8287           new_temp = make_ssa_name (vec_dest, call);
8288           gimple_call_set_lhs (call, new_temp);
8289           gimple_call_set_nothrow (call, true);
8290           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8291           new_stmt = call;
8292         }
8293       else
8294         {
8295           if (op.num_ops == 3)
8296             vop[2] = vec_oprnds2[i];
8297
               /* Apply the loop mask to the operands in VOP before the
                  operation itself is built.  */
8298           if (masked_loop_p && mask_by_cond_expr)
8299             {
8300               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8301                                               vec_num * ncopies, vectype_in, i);
8302               build_vect_cond_expr (code, vop, mask, gsi);
8303             }
8304
               /* Build the operation without inserting it; in all three
                  cases it receives a fresh SSA lhs and is emitted
                  below.  */
8305           if (emulated_mixed_dot_prod)
8306             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8307                                                     vec_dest, vop);
8308           else if (code.is_internal_fn ())
8309             new_stmt = gimple_build_call_internal (internal_fn (code),
8310                                                    op.num_ops,
8311                                                    vop[0], vop[1], vop[2]);
8312           else
8313             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8314                                             vop[0], vop[1], vop[2]);
8315           new_temp = make_ssa_name (vec_dest, new_stmt);
8316           gimple_set_lhs (new_stmt, new_temp);
8317           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8318         }
8319
           /* Record the result.  In a single def-use cycle every copy but
              the last feeds its result into the reduction operand of the
              next copy; only the last copy is recorded as a vectorized
              statement of STMT_INFO.  */
8320       if (slp_node)
8321         slp_node->push_vec_def (new_stmt);
8322       else if (single_defuse_cycle
8323                && i < ncopies - 1)
8324         {
8325           if (reduc_index == 0)
8326             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8327           else if (reduc_index == 1)
8328             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8329           else if (reduc_index == 2)
8330             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8331         }
8332       else
8333         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8334     }
8335
8336   if (!slp_node)
8337     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8338
8339   return true;
8340 }
8341
8342 /* Transform phase of a cycle PHI.

        STMT_INFO is the scalar PHI.  Unless the reduction is an
        EXTRACT_LAST_REDUCTION or FOLD_LEFT_REDUCTION, for which the scalar
        PHI is left in place, this creates the vector PHIs in the header of
        the loop and sets their loop-entry arguments; the loop-latch
        arguments are not set here.  With SLP_NODE the new PHIs are
        recorded on the node, otherwise they are pushed to
        STMT_VINFO_VEC_STMTS (STMT_INFO) and *VEC_STMT is set to the first
        one.  SLP_NODE_INSTANCE is only consulted for an SLP reduction that
        is not a nested cycle.  Always returns true.  */
8343
8344 bool
8345 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8346                           stmt_vec_info stmt_info, gimple **vec_stmt,
8347                           slp_tree slp_node, slp_instance slp_node_instance)
8348 {
8349   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8350   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8351   int i;
8352   int ncopies;
8353   int j;
8354   bool nested_cycle = false;
8355   int vec_num;
8356
8357   if (nested_in_vect_loop_p (loop, stmt_info))
8358     {
8359       loop = loop->inner;
8360       nested_cycle = true;
8361     }
8362
8363   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8364   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8365   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8366   gcc_assert (reduc_info->is_reduc_info);
8367
8368   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8369       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8370     /* Leave the scalar phi in place.  */
8371     return true;
8372
8373   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8374   /* For a nested cycle we do not fill the above.  */
8375   if (!vectype_in)
8376     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8377   gcc_assert (vectype_in);
8378
8379   if (slp_node)
8380     {
8381       /* The size vect_schedule_slp_instance computes is off for us.  */
8382       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8383                                       * SLP_TREE_LANES (slp_node), vectype_in);
8384       ncopies = 1;
8385     }
8386   else
8387     {
8388       vec_num = 1;
8389       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8390     }
8391
8392   /* Check whether we should use a single PHI node and accumulate
8393      vectors to one before the backedge.  */
8394   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8395     ncopies = 1;
8396
8397   /* Create the destination vector  */
8398   gphi *phi = as_a <gphi *> (stmt_info->stmt);
8399   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8400                                                vectype_out);
8401
8402   /* Get the loop-entry arguments.  The code below either fills
          VEC_INITIAL_DEFS directly or computes a single VEC_INITIAL_DEF
          that is replicated for every copy afterwards.  */
8403   tree vec_initial_def = NULL_TREE;
8404   auto_vec<tree> vec_initial_defs;
8405   if (slp_node)
8406     {
8407       vec_initial_defs.reserve (vec_num);
8408       if (nested_cycle)
8409         {
               /* Take the initial values from the SLP child that
                  corresponds to the preheader edge.  */
8410           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8411           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8412                              &vec_initial_defs);
8413         }
8414       else
8415         {
8416           gcc_assert (slp_node == slp_node_instance->reduc_phis);
8417           vec<tree> &initial_values = reduc_info->reduc_initial_values;
8418           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8419
               /* Record the scalar initial value of each PHI; for a
                  reduction group only the first PHI is considered.  */
8420           unsigned int num_phis = stmts.length ();
8421           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8422             num_phis = 1;
8423           initial_values.reserve (num_phis);
8424           for (unsigned int i = 0; i < num_phis; ++i)
8425             {
8426               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8427               initial_values.quick_push (vect_phi_initial_value (this_phi));
8428             }
8429           if (vec_num == 1)
8430             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
               /* NOTE(review): the emptiness check suggests that
                  vect_find_reusable_accumulator may clear the recorded
                  initial values - confirm against its definition.  */
8431           if (!initial_values.is_empty ())
8432             {
8433               tree initial_value
8434                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8435               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8436               tree neutral_op
8437                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8438                                             code, initial_value);
8439               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8440                                               &vec_initial_defs, vec_num,
8441                                               stmts.length (), neutral_op);
8442             }
8443         }
8444     }
8445   else
8446     {
8447       /* Get at the scalar def before the loop, that defines the initial
8448          value of the reduction variable.  */
8449       tree initial_def = vect_phi_initial_value (phi);
8450       reduc_info->reduc_initial_values.safe_push (initial_def);
8451       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8452          and we can't use zero for induc_val, use initial_def.  Similarly
8453          for REDUC_MIN and initial_def larger than the base.  */
8454       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8455         {
8456           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8457           if (TREE_CODE (initial_def) == INTEGER_CST
8458               && !integer_zerop (induc_val)
8459               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8460                    && tree_int_cst_lt (initial_def, induc_val))
8461                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8462                       && tree_int_cst_lt (induc_val, initial_def))))
8463             {
8464               induc_val = initial_def;
8465               /* Communicate we used the initial_def to epilogue
8466                  generation.  */
8467               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8468             }
8469           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8470         }
8471       else if (nested_cycle)
8472         {
8473           /* Do not use an adjustment def as that case is not supported
8474              correctly if ncopies is not one.  */
8475           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8476                                          ncopies, initial_def,
8477                                          &vec_initial_defs);
8478         }
8479       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8480                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8481         /* Fill the initial vector with the initial scalar value.  */
8482         vec_initial_def
8483           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8484                                            initial_def, initial_def);
8485       else
8486         {
8487           if (ncopies == 1)
8488             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8489           if (!reduc_info->reduc_initial_values.is_empty ())
8490             {
8491               initial_def = reduc_info->reduc_initial_values[0];
8492               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8493               tree neutral_op
8494                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8495                                             code, initial_def);
8496               gcc_assert (neutral_op);
8497               /* Try to simplify the vector initialization by applying an
8498                  adjustment after the reduction has been performed.  */
8499               if (!reduc_info->reused_accumulator
8500                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8501                   && !operand_equal_p (neutral_op, initial_def))
8502                 {
8503                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8504                     = initial_def;
8505                   initial_def = neutral_op;
8506                 }
8507               vec_initial_def
8508                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8509                                                  initial_def, neutral_op);
8510             }
8511         }
8512     }
8513
       /* A single initial vector was computed above; use it for every
          copy.  */
8514   if (vec_initial_def)
8515     {
8516       vec_initial_defs.create (ncopies);
8517       for (i = 0; i < ncopies; ++i)
8518         vec_initial_defs.quick_push (vec_initial_def);
8519     }
8520
       /* When an accumulator is reused, make its value the loop-entry
          value, first adapting it to VECTYPE_OUT if the types differ.  */
8521   if (auto *accumulator = reduc_info->reused_accumulator)
8522     {
8523       tree def = accumulator->reduc_input;
8524       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8525         {
8526           unsigned int nreduc;
8527           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8528                                             (TREE_TYPE (def)),
8529                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8530                                           &nreduc);
8531           gcc_assert (res);
8532           gimple_seq stmts = NULL;
8533           /* Reduce the single vector to a smaller one.  */
8534           if (nreduc != 1)
8535             {
8536               /* Perform the reduction in the appropriate type.  */
8537               tree rvectype = vectype_out;
8538               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8539                                               TREE_TYPE (TREE_TYPE (def))))
8540                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8541                                               TYPE_VECTOR_SUBPARTS
8542                                                 (vectype_out));
8543               def = vect_create_partial_epilog (def, rvectype,
8544                                                 STMT_VINFO_REDUC_CODE
8545                                                   (reduc_info),
8546                                                 &stmts);
8547             }
8548           /* The epilogue loop might use a different vector mode, like
8549              VNx2DI vs. V2DI.  */
8550           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8551             {
8552               tree reduc_type = build_vector_type_for_mode
8553                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8554               def = gimple_convert (&stmts, reduc_type, def);
8555             }
8556           /* Adjust the input so we pick up the partially reduced value
8557              for the skip edge in vect_create_epilog_for_reduction.  */
8558           accumulator->reduc_input = def;
8559           /* And the reduction could be carried out using a different sign.  */
8560           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8561             def = gimple_convert (&stmts, vectype_out, def);
8562           if (loop_vinfo->main_loop_edge)
8563             {
8564               /* While we'd like to insert on the edge this will split
8565                  blocks and disturb bookkeeping, we also will eventually
8566                  need this on the skip edge.  Rely on sinking to
8567                  fixup optimal placement and insert in the pred.  */
8568               gimple_stmt_iterator gsi
8569                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8570               /* Insert before a cond that eventually skips the
8571                  epilogue.  */
8572               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8573                 gsi_prev (&gsi);
8574               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8575             }
8576           else
8577             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8578                                               stmts);
8579         }
           /* With a main loop edge the first initial def is replaced by
              the value vect_get_main_loop_result builds from DEF and the
              previous initial def; otherwise DEF is appended to the
              initial defs.  */
8580       if (loop_vinfo->main_loop_edge)
8581         vec_initial_defs[0]
8582           = vect_get_main_loop_result (loop_vinfo, def,
8583                                        vec_initial_defs[0]);
8584       else
8585         vec_initial_defs.safe_push (def);
8586     }
8587
8588   /* Generate the reduction PHIs upfront.  */
8589   for (i = 0; i < vec_num; i++)
8590     {
8591       tree vec_init_def = vec_initial_defs[i];
8592       for (j = 0; j < ncopies; j++)
8593         {
8594           /* Create the reduction-phi that defines the reduction
8595              operand.  */
8596           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8597
8598           /* Set the loop-entry arg of the reduction-phi.  For a nested
                  cycle every copy after the first uses its own initial
                  def; otherwise all copies share the one selected
                  above.  */
8599           if (j != 0 && nested_cycle)
8600             vec_init_def = vec_initial_defs[j];
8601           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8602                        UNKNOWN_LOCATION);
8603
8604           /* The loop-latch arg is set in epilogue processing.  */
8605
8606           if (slp_node)
8607             slp_node->push_vec_def (new_phi);
8608           else
8609             {
8610               if (j == 0)
8611                 *vec_stmt = new_phi;
8612               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8613             }
8614         }
8615     }
8616
8617   return true;
8618 }
8619
8620 /* Vectorizes LC PHIs.

        STMT_INFO must be a PHI with exactly one argument whose def type is
        vect_internal_def or vect_double_reduction_def, and LOOP_VINFO must
        be non-null; otherwise false is returned.

        When VEC_STMT is null only the analysis is done: with SLP_NODE the
        vector type of its first child is checked against the type of the
        node, and on success STMT_INFO is marked as lc_phi_info_type.

        Otherwise one vector PHI per vector operand is created in the block
        of the scalar PHI.  With SLP_NODE the new PHIs are recorded on the
        node, else they are pushed to STMT_VINFO_VEC_STMTS (STMT_INFO) and
        *VEC_STMT is set to the first one.  */
8621
8622 bool
8623 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8624                      stmt_vec_info stmt_info, gimple **vec_stmt,
8625                      slp_tree slp_node)
8626 {
8627   if (!loop_vinfo
8628       || !is_a <gphi *> (stmt_info->stmt)
8629       || gimple_phi_num_args (stmt_info->stmt) != 1)
8630     return false;
8631
8632   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8633       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8634     return false;
8635
8636   if (!vec_stmt) /* transformation not required.  */
8637     {
8638       /* Deal with copies from externs or constants that disguise as
8639          loop-closed PHI nodes (PR97886).  */
8640       if (slp_node
8641           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8642                                                 SLP_TREE_VECTYPE (slp_node)))
8643         {
8644           if (dump_enabled_p ())
8645             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8646                              "incompatible vector types for invariants\n");
8647           return false;
8648         }
8649       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8650       return true;
8651     }
8652
       /* Transform.  The PHI has a single argument, so its block is
          accessed through its single predecessor edge.  */
8653   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8654   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8655   basic_block bb = gimple_bb (stmt_info->stmt);
8656   edge e = single_pred_edge (bb);
8657   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8658   auto_vec<tree> vec_oprnds;
       /* Without SLP the number of copies is derived from VECTYPE; with
          SLP a single copy is requested.  */
8659   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8660                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8661                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8662   for (unsigned i = 0; i < vec_oprnds.length (); i++)
8663     {
8664       /* Create the vectorized LC PHI node.  */
8665       gphi *new_phi = create_phi_node (vec_dest, bb);
8666       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8667       if (slp_node)
8668         slp_node->push_vec_def (new_phi);
8669       else
8670         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8671     }
8672   if (!slp_node)
8673     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8674
8675   return true;
8676 }
8677
8678 /* Vectorizes PHIs.  Only PHIs that are part of an SLP node (SLP_NODE
        non-null) and whose def type is vect_internal_def are handled;
        false is returned for anything else.

        When VEC_STMT is null this is the analysis phase: every SLP child
        must exist and be compatible with the vector type of SLP_NODE, the
        cost is recorded in COST_VEC and STMT_INFO is marked phi_info_type.

        Otherwise this is the transform phase: vector PHIs are created in
        the block of the scalar PHI and recorded as vector defs of SLP_NODE,
        and the arguments from all already vectorized children are added.  */
8679
8680 bool
8681 vectorizable_phi (vec_info *,
8682                   stmt_vec_info stmt_info, gimple **vec_stmt,
8683                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8684 {
8685   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8686     return false;
8687
8688   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8689     return false;
8690
8691   tree vectype = SLP_TREE_VECTYPE (slp_node);
8692
8693   if (!vec_stmt) /* transformation not required.  */
8694     {
8695       slp_tree child;
8696       unsigned i;
           /* Reject the PHI as soon as one child is missing or cannot be
              made to agree with VECTYPE.  */
8697       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8698         if (!child)
8699           {
8700             if (dump_enabled_p ())
8701               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8702                                "PHI node with unvectorized backedge def\n");
8703             return false;
8704           }
8705         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8706           {
8707             if (dump_enabled_p ())
8708               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8709                                "incompatible vector types for invariants\n");
8710             return false;
8711           }
8712         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8713                  && !useless_type_conversion_p (vectype,
8714                                                 SLP_TREE_VECTYPE (child)))
8715           {
8716             /* With bools we can have mask and non-mask precision vectors
8717                or different non-mask precisions.  While pattern recog is
8718                supposed to guarantee consistency here, bugs in it can cause
8719                mismatches (PR103489 and PR103800 for example).
8720                Deal with them here instead of ICEing later.  */
8721             if (dump_enabled_p ())
8722               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8723                                "incompatible vector type setup from "
8724                                "bool pattern detection\n");
8725             return false;
8726           }
8727
8728       /* For single-argument PHIs assume coalescing which means zero cost
8729          for the scalar and the vector PHIs.  This avoids artificially
8730          favoring the vector path (but may pessimize it in some cases).  */
8731       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8732         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8733                           vector_stmt, stmt_info, vectype, 0, vect_body);
8734       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8735       return true;
8736     }
8737
       /* Transform.  The vector PHIs are created lazily, when the first
          child that already has vector defs is seen.  */
8738   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8739   basic_block bb = gimple_bb (stmt_info->stmt);
8740   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8741   auto_vec<gphi *> new_phis;
8742   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8743     {
8744       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8745
8746       /* Skip not yet vectorized defs.  No argument is added for them
              here.  */
8747       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8748           && SLP_TREE_VEC_DEFS (child).is_empty ())
8749         continue;
8750
8751       auto_vec<tree> vec_oprnds;
8752       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8753       if (!new_phis.exists ())
8754         {
8755           new_phis.create (vec_oprnds.length ());
8756           for (unsigned j = 0; j < vec_oprnds.length (); j++)
8757             {
8758               /* Create the vectorized PHI node and record it as a
                      vector def of SLP_NODE.  */
8759               new_phis.quick_push (create_phi_node (vec_dest, bb));
8760               slp_node->push_vec_def (new_phis[j]);
8761             }
8762         }
           /* Argument I of the scalar PHI and of each vector PHI flow in
              over the same edge.  */
8763       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8764       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8765         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8766     }
8767   /* We should have at least one already vectorized child.  */
8768   gcc_assert (new_phis.exists ());
8769
8770   return true;
8771 }
8772
8773 /* Vectorizes first order recurrences.  An overview of the transformation
8774    is described below.  Suppose we have the following loop.
8775
8776      int t = 0;
8777      for (int i = 0; i < n; ++i)
8778        {
8779          b[i] = a[i] - t;
8780          t = a[i];
8781        }
8782
8783    There is a first-order recurrence on 't'.  For this loop, the scalar IR
8784    looks (simplified) like:
8785
8786     scalar.preheader:
8787       init = 0;
8788
8789     scalar.body:
8790       i = PHI <0(scalar.preheader), i+1(scalar.body)>
8791       _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8792       _1 = a[i]
8793       b[i] = _1 - _2
8794       if (i < n) goto scalar.body
8795
8796    In this example, _2 is a recurrence because its value depends on the
8797    previous iteration.  We vectorize this as (VF = 4)
8798
8799     vector.preheader:
8800       vect_init = vect_cst(..., ..., ..., 0)
8801
8802     vector.body
8803       i = PHI <0(vector.preheader), i+4(vector.body)>
8804       vect_1 = PHI <vect_init(vector.preheader), vect_2(vector.body)>
8805       vect_2 = a[i, i+1, i+2, i+3];
8806       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8807       b[i, i+1, i+2, i+3] = vect_2 - vect_3
8808       if (..) goto vector.body
8809
8810    In this function, vectorizable_recurr, we code generate both the
8811    vector PHI node and the permute since those together compute the
8812    vectorized value of the scalar PHI.  We do not yet have the
8813    backedge value to fill in there nor into the vec_perm.  Those
8814    are filled in maybe_set_vectorized_backedge_value and
8815    vect_schedule_scc.
8816
8817    TODO:  Since the scalar loop does not have a use of the recurrence
8818    outside of the loop the natural way to implement peeling via
8819    vectorizing the live value doesn't work.  For now peeling of loops
8820    with a recurrence is not implemented.  For SLP the supported cases
8821    are restricted to those requiring a single vector recurrence PHI.

        When VEC_STMT is null only the analysis is done: the permute is
        checked for target support and the costs are recorded in COST_VEC.
        Otherwise the vector PHI and the permutes are generated.  Returns
        true if STMT_INFO can be vectorized this way.  */
8822
8823 bool
8824 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8825                      gimple **vec_stmt, slp_tree slp_node,
8826                      stmt_vector_for_cost *cost_vec)
8827 {
8828   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8829     return false;
8830
8831   gphi *phi = as_a<gphi *> (stmt_info->stmt);
8832
8833   /* So far we only support first-order recurrence auto-vectorization.  */
8834   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8835     return false;
8836
8837   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8838   unsigned ncopies;
8839   if (slp_node)
8840     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8841   else
8842     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8843   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
       /* DIST is the number of lanes the permute reaches back into the
          previous vector: the SLP lane count, or 1 without SLP.  */
8844   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8845   /* We need to be able to make progress with a single vector.  */
8846   if (maybe_gt (dist * 2, nunits))
8847     {
8848       if (dump_enabled_p ())
8849         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8850                          "first order recurrence exceeds half of "
8851                          "a vector\n");
8852       return false;
8853     }
8854
8855   /* First-order recurrence autovectorization needs to handle permutation
8856      with indices = [nunits-dist, nunits-dist+1, nunits-dist+2, ...].  */
8857   vec_perm_builder sel (nunits, 1, 3);
8858   for (int i = 0; i < 3; ++i)
8859     sel.quick_push (nunits - dist + i);
8860   vec_perm_indices indices (sel, 2, nunits);
8861
8862   if (!vec_stmt) /* transformation not required.  */
8863     {
8864       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8865                                  indices))
8866         return false;
8867
8868       if (slp_node)
8869         {
8870           /* We eventually need to set a vector type on invariant
8871              arguments.  */
8872           unsigned j;
8873           slp_tree child;
8874           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8875             if (!vect_maybe_update_slp_op_vectype
8876                   (child, SLP_TREE_VECTYPE (slp_node)))
8877               {
8878                 if (dump_enabled_p ())
8879                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8880                                    "incompatible vector types for "
8881                                    "invariants\n");
8882                 return false;
8883               }
8884         }
8885       /* The recurrence costs the initialization vector and one permute
8886          for each copy.  */
8887       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8888                                                  stmt_info, 0, vect_prologue);
8889       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8890                                                stmt_info, 0, vect_body);
8891       if (dump_enabled_p ())
8892         dump_printf_loc (MSG_NOTE, vect_location,
8893                          "vectorizable_recurr: inside_cost = %d, "
8894                          "prologue_cost = %d .\n", inside_cost,
8895                          prologue_cost);
8896
8897       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8898       return true;
8899     }
8900
       /* Transform.  Build the initial vector from the value the scalar PHI
          receives on the preheader edge, converting it to the vector element
          type first when the types differ.  */
8901   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8902   basic_block bb = gimple_bb (phi);
8903   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8904   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8905     {
8906       gimple_seq stmts = NULL;
8907       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8908       gsi_insert_seq_on_edge_immediate (pe, stmts);
8909     }
8910   tree vec_init = build_vector_from_val (vectype, preheader);
8911   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8912
8913   /* Create the vectorized first-order PHI node.  Only the preheader
          argument is added here.  */
8914   tree vec_dest = vect_get_new_vect_var (vectype,
8915                                          vect_simple_var, "vec_recur_");
8916   gphi *new_phi = create_phi_node (vec_dest, bb);
8917   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8918
8919   /* Insert shuffles for the first-order recurrence autovectorization.
8920        result = VEC_PERM <vec_recur, vect_1, index[nunits-dist, ...]>.  */
8921   tree perm = vect_gen_perm_mask_checked (vectype, indices);
8922
8923   /* Insert the required permute after the latch definition.  The
8924      second and later operands are tentative and will be updated when we have
8925      vectorized the latch definition.  */
8926   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8927   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8928   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8929   gsi_next (&gsi2);
8930
8931   for (unsigned i = 0; i < ncopies; ++i)
8932     {
8933       vec_dest = make_ssa_name (vectype);
           /* Only the first copy gets a real first operand (the vector PHI);
              all remaining operands are left NULL for now.  */
8934       gassign *vperm
8935           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8936                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
8937                                  NULL, perm);
8938       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8939
8940       if (slp_node)
8941         slp_node->push_vec_def (vperm);
8942       else
8943         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8944     }
8945
8946   if (!slp_node)
8947     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8948   return true;
8949 }
8950
8951 /* Return true if VECTYPE must be lowered by the vector lowering pass.  */
8952
8953 bool
8954 vect_emulated_vector_p (tree vectype)
8955 {
8956   /* A type with a vector mode is never emulated.  */
8957   if (VECTOR_MODE_P (TYPE_MODE (vectype)))
8958     return false;
8959   return (!VECTOR_BOOLEAN_TYPE_P (vectype)
8960           || TYPE_PRECISION (TREE_TYPE (vectype)) != 1);
8961 }
8961
8962 /* Return true if CODE is an operation we can emulate on an integer
8963    mode representation of a vector: addition, subtraction, negation
8964    and the bitwise operations.  */
8965
8966 bool
8967 vect_can_vectorize_without_simd_p (tree_code code)
8968 {
8969   /* Arithmetic codes.  */
8970   if (code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
8971     return true;
8972
8973   /* Bitwise codes.  */
8974   if (code == BIT_AND_EXPR
8975       || code == BIT_IOR_EXPR
8976       || code == BIT_XOR_EXPR
8977       || code == BIT_NOT_EXPR)
8978     return true;
8979
8980   /* No other code can be emulated.  */
8981   return false;
8982 }
8983
8984 /* Likewise, but taking a code_helper; non-tree codes yield false.  */
8985 bool
8986 vect_can_vectorize_without_simd_p (code_helper code)
8987 {
8988   if (!code.is_tree_code ())
8989     return false;
8990   return vect_can_vectorize_without_simd_p (tree_code (code));
8991 }
8992
8993 /* Create vector init for vectorized iv.

        Build, by appending statements to STMTS, the initial value of the
        vector IV of type VECTYPE (with NUNITS lanes) for a nonlinear
        induction of kind INDUCTION_TYPE with scalar initial value INIT_EXPR
        (X below) and step STEP_EXPR (S below), and return it:

          shr/shl: lane i is X shifted by i * S
          neg:     lanes alternate X, -X, X, -X, ...
          mul:     lane i is X * pow (S, i); NUNITS must be constant.  */
8994 static tree
8995 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8996                                tree step_expr, poly_uint64 nunits,
8997                                tree vectype,
8998                                enum vect_induction_op_type induction_type)
8999 {
9000   unsigned HOST_WIDE_INT const_nunits;
9001   tree vec_shift, vec_init, new_name;
9002   unsigned i;
9003   tree itype = TREE_TYPE (vectype);
9004
9005   /* Convert the scalar initial value to the vector element type; every
9006      case below starts from a splat of it.  */
9007   new_name = gimple_convert (stmts, itype, init_expr);
9008   switch (induction_type)
9009     {
9010     case vect_step_op_shr:
9011     case vect_step_op_shl:
9012       /* Build the Initial value from shift_expr: shift the splat of X by
              the series [0, S, 2*S, ...].  */
9013       vec_init = gimple_build_vector_from_val (stmts,
9014                                                vectype,
9015                                                new_name);
9016       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9017                                 build_zero_cst (itype), step_expr);
9018       vec_init = gimple_build (stmts,
9019                                (induction_type == vect_step_op_shr
9020                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9021                                vectype, vec_init, vec_shift);
9022       break;
9023
9024     case vect_step_op_neg:
9025       {
9026         vec_init = gimple_build_vector_from_val (stmts,
9027                                                  vectype,
9028                                                  new_name);
9029         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9030                                      vectype, vec_init);
9031         /* The encoding has 2 interleaved stepped patterns: lanes
                0, 1, 2, ... of the first input alternate with lanes
                0, 1, 2, ... of the second input.  */
9032         vec_perm_builder sel (nunits, 2, 3);
9033         sel.quick_grow (6);
9034         for (i = 0; i < 3; i++)
9035           {
9036             sel[2 * i] = i;
9037             sel[2 * i + 1] = i + nunits;
9038           }
9039         vec_perm_indices indices (sel, 2, nunits);
9040         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9041            fail when vec_init is const vector. In that situation vec_perm is not
9042            really needed.  */
9043         tree perm_mask_even
9044           = vect_gen_perm_mask_any (vectype, indices);
9045         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9046                                  vectype,
9047                                  vec_init, vec_neg,
9048                                  perm_mask_even);
9049       }
9050       break;
9051
9052     case vect_step_op_mul:
9053       {
9054         /* Use unsigned mult to avoid undefined signed integer overflow.  */
             /* Read the lane count with a separate call instead of as a side
                effect of the asserted expression, so that CONST_NUNITS is
                set even if the assertion operand is not evaluated.  */
9055         gcc_assert (nunits.is_constant ());
             const_nunits = nunits.to_constant ();
9056         tree utype = unsigned_type_for (itype);
9057         tree uvectype = build_vector_type (utype,
9058                                            TYPE_VECTOR_SUBPARTS (vectype));
9059         new_name = gimple_convert (stmts, utype, new_name);
9060         vec_init = gimple_build_vector_from_val (stmts,
9061                                                  uvectype,
9062                                                  new_name);
9063         tree_vector_builder elts (uvectype, const_nunits, 1);
9064         tree elt_step = build_one_cst (utype);
9065
9066         elts.quick_push (elt_step);
9067         for (i = 1; i < const_nunits; i++)
9068           {
9069             /* Create: elt_step_i = elt_step_(i-1) * step_expr.  */
9070             elt_step = gimple_build (stmts, MULT_EXPR,
9071                                      utype, elt_step, step_expr);
9072             elts.quick_push (elt_step);
9073           }
9074         /* Create a vector from [elt_step_0, elt_step_1, ...,
9075            elt_step_nunits-1] and scale the splat of X by it.  */
9076         tree vec_mul = gimple_build_vector (stmts, &elts);
9077         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9078                                  vec_init, vec_mul);
9079         vec_init = gimple_convert (stmts, vectype, vec_init);
9080       }
9081       break;
9082
9083     default:
9084       gcc_unreachable ();
9085     }
9086
9087   return vec_init;
9088 }
9089
9090 /* Peel init_expr by skip_niter for induction_type.

        Return the value that a nonlinear IV of kind INDUCTION_TYPE with
        initial value INIT_EXPR and step STEP_EXPR has after SKIP_NITERS
        iterations.  SKIP_NITERS must be an INTEGER_CST.  Statements needed
        to compute the result are appended to STMTS.  */
9091 tree
9092 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9093                              tree skip_niters, tree step_expr,
9094                              enum vect_induction_op_type induction_type)
9095 {
9096   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9097   tree type = TREE_TYPE (init_expr);
9098   unsigned prec = TYPE_PRECISION (type);
9099   switch (induction_type)
9100     {
9101     case vect_step_op_neg:
           /* Only the parity of the iteration count matters.  */
9102       if (TREE_INT_CST_LOW (skip_niters) % 2)
9103         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9104       /* else no change.  */
9105       break;
9106
9107     case vect_step_op_shr:
9108     case vect_step_op_shl:
           /* The accumulated shift amount is STEP_EXPR * SKIP_NITERS.  */
9109       skip_niters = gimple_convert (stmts, type, skip_niters);
9110       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9111       /* When the shift amount >= precision, we need to avoid undefined
9112          behavior.  In the original loop there is none, and according to
9113          the semantics init_expr should be 0 for lshr, ashl, and
              >>= (prec - 1) for ashr.  */
9114       if (!tree_fits_uhwi_p (step_expr)
9115           || tree_to_uhwi (step_expr) >= prec)
9116         {
9117           if (induction_type == vect_step_op_shl
9118               || TYPE_UNSIGNED (type))
9119             init_expr = build_zero_cst (type);
9120           else
9121             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9122                                       init_expr,
9123                                       wide_int_to_tree (type, prec - 1));
9124         }
9125       else
9126         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9127                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9128                                   type, init_expr, step_expr);
9129       break;
9130
9131     case vect_step_op_mul:
9132       {
             /* Multiply in the unsigned variant of TYPE so that wrap-around
                is well defined.  */
9133         tree utype = unsigned_type_for (type);
9134         init_expr = gimple_convert (stmts, utype, init_expr);
             /* Compute pow (STEP_EXPR, SKIP_NITERS), wrapping at the
                precision of STEP_EXPR, by square-and-multiply.  This needs
                only a logarithmic number of multiplications and yields 1
                for a zero iteration count, where counting up to
                SKIP_NITERS - 1 would wrap around.  The low
                unsigned HOST_WIDE_INT of SKIP_NITERS is used as exponent.  */
9135         unsigned HOST_WIDE_INT exponent = TREE_INT_CST_LOW (skip_niters);
9136         wide_int square = wi::to_wide (step_expr);
9137         wide_int begin = wi::uhwi (1, square.get_precision ());
9138         for (; exponent != 0; exponent >>= 1)
               {
                 if (exponent & 1)
                   begin = wi::mul (begin, square);
                 square = wi::mul (square, square);
               }
9139         tree mult_expr = wide_int_to_tree (utype, begin);
9140         init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9141         init_expr = gimple_convert (stmts, type, init_expr);
9142       }
9143       break;
9144
9145     default:
9146       gcc_unreachable ();
9147     }
9148
9149   return init_expr;
9150 }
9151
9152 /* Compute the scalar amount by which a nonlinear IV of kind INDUCTION_TYPE
9153    with per-iteration step STEP_EXPR advances over VF iterations.  Any
9154    statements needed are appended to STMTS.  Returns NULL for negation.  */
9155 static tree
9156 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9157                                poly_uint64 vf,
9158                                enum vect_induction_op_type induction_type)
9159 {
9160   tree step_type = TREE_TYPE (step_expr);
9161   tree vf_cst = build_int_cst (step_type, vf);
9162
9163   /* Negation needs no step at all.  */
9164   if (induction_type == vect_step_op_neg)
9165     return NULL;
9166
9167   /* Every kind other than multiplication advances by VF * STEP_EXPR.  */
9168   if (induction_type != vect_step_op_mul)
9169     return gimple_build (stmts, MULT_EXPR, step_type, vf_cst, step_expr);
9170
9171   /* Multiplication advances by pow (STEP_EXPR, VF).  */
9172   gcc_assert (vf.is_constant ());
9173   wide_int factor = wi::to_wide (step_expr);
9174   const unsigned HOST_WIDE_INT n_mults = vf.to_constant () - 1;
9175   for (unsigned k = 0; k != n_mults; ++k)
9176     factor = wi::mul (factor, wi::to_wide (step_expr));
9177   return wide_int_to_tree (step_type, factor);
9178 }
9179
9180 /* Build the vector step: NEW_NAME splat across VECTYPE, or NULL for neg.  */
9181 static tree
9182 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9183                                    stmt_vec_info stmt_info,
9184                                    tree new_name, tree vectype,
9185                                    enum vect_induction_op_type induction_type)
9186 {
9187   /* Negation inductions do not use a step vector.  */
9188   if (induction_type == vect_step_op_neg)
9189     return NULL;
9190
9191   tree step_copy = unshare_expr (new_name);
9192   gcc_assert (TREE_CODE (new_name) == SSA_NAME
9193               || CONSTANT_CLASS_P (new_name));
9194   tree splat = build_vector_from_val (vectype, step_copy);
9195   /* Hand the splat to vect_init_vector and return what it gives back.  */
9196   return vect_init_vector (loop_vinfo, stmt_info, splat, vectype, NULL);
9197 }
9198
9199 /* Return the updated value of the vectorized nonlinear IV INDUC_DEF, i.e.
9200    INDUC_DEF combined with VEC_STEP as dictated by INDUCTION_TYPE.  VECTYPE
9201    is the vector type of the IV; new statements are appended to STMTS.  */
9202 static tree
9203 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9204                           tree induc_def, tree vec_step,
9205                           enum vect_induction_op_type induction_type)
9206 {
9207   /* Negation: the IV is returned unchanged.  */
9208   if (induction_type == vect_step_op_neg)
9209     return induc_def;
9210
9211   /* Right shift, done directly in VECTYPE.  */
9212   if (induction_type == vect_step_op_shr)
9213     return gimple_build (stmts, RSHIFT_EXPR, vectype,
9214                          induc_def, vec_step);
9215
9216   /* Left shift, done directly in VECTYPE.  */
9217   if (induction_type == vect_step_op_shl)
9218     return gimple_build (stmts, LSHIFT_EXPR, vectype,
9219                          induc_def, vec_step);
9220
9221   /* Only multiplication is left.  */
9222   if (induction_type != vect_step_op_mul)
9223     gcc_unreachable ();
9224
9225   /* Use unsigned mult to avoid UD integer overflow: convert both
9226      operands to the unsigned vector type, multiply, convert back.  */
9227   tree elt_utype = unsigned_type_for (TREE_TYPE (vectype));
9228   tree uvectype
9229     = build_vector_type (elt_utype, TYPE_VECTOR_SUBPARTS (vectype));
9230   tree lhs = gimple_convert (stmts, uvectype, induc_def);
9231   tree rhs = gimple_convert (stmts, uvectype, vec_step);
9232   tree product = gimple_build (stmts, MULT_EXPR, uvectype, lhs, rhs);
9233   return gimple_convert (stmts, vectype, product);
9234 }
9242
9243 /* Function vectorizable_nonlinear_induction
9244
9245    Check if STMT_INFO performs a nonlinear induction computation that can be
9246    vectorized.  If VEC_STMT is also passed, vectorize the induction PHI: create
9247    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9248    basic block.
9249    Return true if STMT_INFO is vectorizable in this way.  */
9250
9251 static bool
9252 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9253                                   stmt_vec_info stmt_info,
9254                                   gimple **vec_stmt, slp_tree slp_node,
9255                                   stmt_vector_for_cost *cost_vec)
9256 {
9257   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9258   unsigned ncopies;
9259   bool nested_in_vect_loop = false;
9260   class loop *iv_loop;
9261   tree vec_def;
9262   edge pe = loop_preheader_edge (loop);
9263   basic_block new_bb;
9264   tree vec_init, vec_step;
9265   tree new_name;
9266   gimple *new_stmt;
9267   gphi *induction_phi;
9268   tree induc_def, vec_dest;
9269   tree init_expr, step_expr;
9270   tree niters_skip;
9271   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9272   unsigned i;
9273   gimple_stmt_iterator si;
9274
9275   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9276
9277   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9278   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9279   enum vect_induction_op_type induction_type
9280     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9281
9282   gcc_assert (induction_type > vect_step_op_add);
9283
9284   if (slp_node)
9285     ncopies = 1;
9286   else
9287     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9288   gcc_assert (ncopies >= 1);
9289
9290   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9291   if (nested_in_vect_loop_p (loop, stmt_info))
9292     {
9293       if (dump_enabled_p ())
9294         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9295                          "nonlinear induction in nested loop.\n");
9296       return false;
9297     }
9298
9299   iv_loop = loop;
9300   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9301
9302   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9303      update for each iv and a permutation to generate wanted vector iv.  */
9304   if (slp_node)
9305     {
9306       if (dump_enabled_p ())
9307         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9308                          "SLP induction not supported for nonlinear"
9309                          " induction.\n");
9310       return false;
9311     }
9312
9313   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9314     {
9315       if (dump_enabled_p ())
9316         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9317                          "floating point nonlinear induction vectorization"
9318                          " not supported.\n");
9319       return false;
9320     }
9321
9322   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9323   init_expr = vect_phi_initial_value (phi);
9324   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9325               && TREE_CODE (step_expr) == INTEGER_CST);
9326   /* step_expr should be aligned with init_expr,
9327      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
9328   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9329
9330   if (TREE_CODE (init_expr) == INTEGER_CST)
9331     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9332   else
9333     gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9334                                        TREE_TYPE (init_expr)));
9335
9336   switch (induction_type)
9337     {
9338     case vect_step_op_neg:
9339       if (TREE_CODE (init_expr) != INTEGER_CST
9340           && TREE_CODE (init_expr) != REAL_CST)
9341         {
9342           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9343           if (!directly_supported_p (NEGATE_EXPR, vectype))
9344             return false;
9345
9346           /* The encoding has 2 interleaved stepped patterns.  */
9347           vec_perm_builder sel (nunits, 2, 3);
9348           machine_mode mode = TYPE_MODE (vectype);
9349           sel.quick_grow (6);
9350           for (i = 0; i < 3; i++)
9351             {
9352               sel[i * 2] = i;
9353               sel[i * 2 + 1] = i + nunits;
9354             }
9355           vec_perm_indices indices (sel, 2, nunits);
9356           if (!can_vec_perm_const_p (mode, mode, indices))
9357             return false;
9358         }
9359       break;
9360
9361     case vect_step_op_mul:
9362       {
9363         /* Check for backend support of MULT_EXPR.  */
9364         if (!directly_supported_p (MULT_EXPR, vectype))
9365           return false;
9366
9367         /* ?? How to construct vector step for variable number vector.
9368            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9369         if (!vf.is_constant ())
9370           return false;
9371       }
9372       break;
9373
9374     case vect_step_op_shr:
9375       /* Check for backend support of RSHIFT_EXPR.  */
9376       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9377         return false;
9378
9379       /* Don't shift more than type precision to avoid UD.  */
9380       if (!tree_fits_uhwi_p (step_expr)
9381           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9382                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9383         return false;
9384       break;
9385
9386     case vect_step_op_shl:
9387       /* Check for backend support of RSHIFT_EXPR.  */
9388       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9389         return false;
9390
9391       /* Don't shift more than type precision to avoid UD.  */
9392       if (!tree_fits_uhwi_p (step_expr)
9393           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9394                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9395         return false;
9396
9397       break;
9398
9399     default:
9400       gcc_unreachable ();
9401     }
9402
9403   if (!vec_stmt) /* transformation not required.  */
9404     {
9405       unsigned inside_cost = 0, prologue_cost = 0;
9406       /* loop cost for vec_loop. Neg induction doesn't have any
9407          inside_cost.  */
9408       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9409                                       stmt_info, 0, vect_body);
9410
9411       /* loop cost for vec_loop. Neg induction doesn't have any
9412          inside_cost.  */
9413       if (induction_type == vect_step_op_neg)
9414         inside_cost = 0;
9415
9416       /* prologue cost for vec_init and vec_step.  */
9417       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9418                                         stmt_info, 0, vect_prologue);
9419
9420       if (dump_enabled_p ())
9421         dump_printf_loc (MSG_NOTE, vect_location,
9422                          "vect_model_induction_cost: inside_cost = %d, "
9423                          "prologue_cost = %d. \n", inside_cost,
9424                          prologue_cost);
9425
9426       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9427       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9428       return true;
9429     }
9430
9431   /* Transform.  */
9432
9433   /* Compute a vector variable, initialized with the first VF values of
9434      the induction variable.  E.g., for an iv with IV_PHI='X' and
9435      evolution S, for a vector of 4 units, we want to compute:
9436      [X, X + S, X + 2*S, X + 3*S].  */
9437
9438   if (dump_enabled_p ())
9439     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9440
9441   pe = loop_preheader_edge (iv_loop);
9442   /* Find the first insertion point in the BB.  */
9443   basic_block bb = gimple_bb (phi);
9444   si = gsi_after_labels (bb);
9445
9446   gimple_seq stmts = NULL;
9447
9448   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9449   /* If we are using the loop mask to "peel" for alignment then we need
9450      to adjust the start value here.  */
9451   if (niters_skip != NULL_TREE)
9452     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9453                                              step_expr, induction_type);
9454
9455   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9456                                             step_expr, nunits, vectype,
9457                                             induction_type);
9458   if (stmts)
9459     {
9460       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9461       gcc_assert (!new_bb);
9462     }
9463
9464   stmts = NULL;
9465   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9466                                             vf, induction_type);
9467   if (stmts)
9468     {
9469       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9470       gcc_assert (!new_bb);
9471     }
9472
9473   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9474                                                 new_name, vectype,
9475                                                 induction_type);
9476   /* Create the following def-use cycle:
9477      loop prolog:
9478      vec_init = ...
9479      vec_step = ...
9480      loop:
9481      vec_iv = PHI <vec_init, vec_loop>
9482      ...
9483      STMT
9484      ...
9485      vec_loop = vec_iv + vec_step;  */
9486
9487   /* Create the induction-phi that defines the induction-operand.  */
9488   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9489   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9490   induc_def = PHI_RESULT (induction_phi);
9491
9492   /* Create the iv update inside the loop.  */
9493   stmts = NULL;
9494   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9495                                       induc_def, vec_step,
9496                                       induction_type);
9497
9498   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9499   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9500
9501   /* Set the arguments of the phi node:  */
9502   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9503   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9504                UNKNOWN_LOCATION);
9505
9506   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9507   *vec_stmt = induction_phi;
9508
9509   /* In case that vectorization factor (VF) is bigger than the number
9510      of elements that we can fit in a vectype (nunits), we have to generate
9511      more than one vector stmt - i.e - we need to "unroll" the
9512      vector stmt by a factor VF/nunits.  For more details see documentation
9513      in vectorizable_operation.  */
9514
9515   if (ncopies > 1)
9516     {
9517       stmts = NULL;
9518       /* FORNOW. This restriction should be relaxed.  */
9519       gcc_assert (!nested_in_vect_loop);
9520
9521       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9522                                                 nunits, induction_type);
9523
9524       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9525                                                     new_name, vectype,
9526                                                     induction_type);
9527       vec_def = induc_def;
9528       for (i = 1; i < ncopies; i++)
9529         {
9530           /* vec_i = vec_prev + vec_step.  */
9531           stmts = NULL;
9532           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9533                                               vec_def, vec_step,
9534                                               induction_type);
9535           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9536           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9537           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9538         }
9539     }
9540
9541   if (dump_enabled_p ())
9542     dump_printf_loc (MSG_NOTE, vect_location,
9543                      "transform induction: created def-use cycle: %G%G",
9544                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9545
9546   return true;
9547 }
9548
9549 /* Function vectorizable_induction
9550
9551    Check if STMT_INFO performs an induction computation that can be vectorized.
9552    With a null VEC_STMT only analyze and record costs in COST_VEC; otherwise
9553    create the vector PHI(s) in the IV loop header (recorded on SLP_NODE when
9554    given, else returned in *VEC_STMT).  Return true if STMT_INFO is vectorizable.  */
9555
9556 bool
9557 vectorizable_induction (loop_vec_info loop_vinfo,
9558                         stmt_vec_info stmt_info,
9559                         gimple **vec_stmt, slp_tree slp_node,
9560                         stmt_vector_for_cost *cost_vec)
9561 {
9562   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9563   unsigned ncopies;
9564   bool nested_in_vect_loop = false;
9565   class loop *iv_loop;
9566   tree vec_def;
9567   edge pe = loop_preheader_edge (loop);  /* Re-set to IV_LOOP's preheader for the transform.  */
9568   basic_block new_bb;
9569   tree new_vec, vec_init, vec_step, t;
9570   tree new_name;
9571   gimple *new_stmt;
9572   gphi *induction_phi;
9573   tree induc_def, vec_dest;
9574   tree init_expr, step_expr;
9575   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9576   unsigned i;
9577   tree expr;
9578   gimple_stmt_iterator si;
9579   enum vect_induction_op_type induction_type
9580     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9581
9582   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9583   if (!phi)  /* Only PHIs are handled here.  */
9584     return false;
9585
9586   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9587     return false;
9588
9589   /* Make sure it was recognized as induction computation.  */
9590   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9591     return false;
9592
9593   /* Handle nonlinear induction (anything but vect_step_op_add) in a separate place.  */
9594   if (induction_type != vect_step_op_add)
9595     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9596                                              vec_stmt, slp_node, cost_vec);
9597
9598   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9599   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9600
9601   if (slp_node)
9602     ncopies = 1;  /* The SLP path counts vectors with SLP_TREE_NUMBER_OF_VEC_STMTS.  */
9603   else
9604     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9605   gcc_assert (ncopies >= 1);
9606
9607   /* FORNOW. These restrictions on IVs of an inner loop should be relaxed.  */
9608   if (nested_in_vect_loop_p (loop, stmt_info))
9609     {
9610       imm_use_iterator imm_iter;
9611       use_operand_p use_p;
9612       gimple *exit_phi;
9613       edge latch_e;
9614       tree loop_arg;
9615
9616       if (ncopies > 1)
9617         {
9618           if (dump_enabled_p ())
9619             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9620                              "multiple types in nested loop.\n");
9621           return false;
9622         }
9623
9624       exit_phi = NULL;  /* First non-debug use of the latch value outside the inner loop.  */
9625       latch_e = loop_latch_edge (loop->inner);
9626       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9627       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9628         {
9629           gimple *use_stmt = USE_STMT (use_p);
9630           if (is_gimple_debug (use_stmt))
9631             continue;
9632
9633           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9634             {
9635               exit_phi = use_stmt;
9636               break;
9637             }
9638         }
9639       if (exit_phi)
9640         {
9641           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9642           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9643                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9644             {
9645               if (dump_enabled_p ())
9646                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9647                                  "inner-loop induction only used outside "
9648                                  "of the outer vectorized loop.\n");
9649               return false;
9650             }
9651         }
9652
9653       nested_in_vect_loop = true;
9654       iv_loop = loop->inner;
9655     }
9656   else
9657     iv_loop = loop;
9658   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9659
9660   if (slp_node && !nunits.is_constant ())
9661     {
9662       /* The current SLP code creates the step value element-by-element.  */
9663       if (dump_enabled_p ())
9664         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9665                          "SLP induction not supported for variable-length"
9666                          " vectors.\n");
9667       return false;
9668     }
9669
9670   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9671     {
9672       if (dump_enabled_p ())
9673         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9674                          "floating point induction vectorization disabled\n");
9675       return false;
9676     }
9677
9678   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9679   gcc_assert (step_expr != NULL_TREE);
9680   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9681
9682   /* Check for backend support of PLUS/MINUS_EXPR on the step's vector type.  */
9683   if (!directly_supported_p (PLUS_EXPR, step_vectype)
9684       || !directly_supported_p (MINUS_EXPR, step_vectype))
9685     return false;
9686
9687   if (!vec_stmt) /* Analysis only: record costs, transformation not required.  */
9688     {
9689       unsigned inside_cost = 0, prologue_cost = 0;
9690       if (slp_node)
9691         {
9692           /* We eventually need to set a vector type on invariant
9693              arguments.  */
9694           unsigned j;
9695           slp_tree child;
9696           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9697             if (!vect_maybe_update_slp_op_vectype
9698                 (child, SLP_TREE_VECTYPE (slp_node)))
9699               {
9700                 if (dump_enabled_p ())
9701                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9702                                    "incompatible vector types for "
9703                                    "invariants\n");
9704                 return false;
9705               }
9706           /* loop cost for vec_loop.  */
9707           inside_cost
9708             = record_stmt_cost (cost_vec,
9709                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9710                                 vector_stmt, stmt_info, 0, vect_body);
9711           /* prologue cost for vec_init (if not nested) and step.  */
9712           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9713                                             scalar_to_vec,
9714                                             stmt_info, 0, vect_prologue);
9715         }
9716       else /* if (!slp_node) */
9717         {
9718           /* loop cost for vec_loop.  */
9719           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9720                                           stmt_info, 0, vect_body);
9721           /* prologue cost for vec_init and vec_step.  */
9722           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9723                                             stmt_info, 0, vect_prologue);
9724         }
9725       if (dump_enabled_p ())
9726         dump_printf_loc (MSG_NOTE, vect_location,
9727                          "vect_model_induction_cost: inside_cost = %d, "
9728                          "prologue_cost = %d .\n", inside_cost,
9729                          prologue_cost);
9730
9731       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9732       DUMP_VECT_SCOPE ("vectorizable_induction");
9733       return true;
9734     }
9735
9736   /* Transform.  */
9737
9738   /* Compute a vector variable, initialized with the first VF values of
9739      the induction variable.  E.g., for an iv with IV_PHI='X' and
9740      evolution S, for a vector of 4 units, we want to compute:
9741      [X, X + S, X + 2*S, X + 3*S].  */
9742
9743   if (dump_enabled_p ())
9744     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9745
9746   pe = loop_preheader_edge (iv_loop);  /* Loop-invariant setup is inserted on this edge.  */
9747   /* Find the first insertion point in the BB; IV updates are inserted before it.  */
9748   basic_block bb = gimple_bb (phi);
9749   si = gsi_after_labels (bb);
9750
9751   /* For SLP induction we have to generate several IVs as for example
9752      with group size 3 we need
9753        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9754        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
9755   if (slp_node)
9756     {
9757       /* Enforced above: SLP requires a constant NUNITS.  */
9758       unsigned int const_nunits = nunits.to_constant ();
9759
9760       /* The initial values are vectorized, but any lanes > group_size
9761          need adjustment.  */
9762       slp_tree init_node
9763         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];  /* When null, scalar inits are gathered into INITS below.  */
9764
9765       /* Gather steps.  Since we do not vectorize inductions as
9766          cycles we have to reconstruct the step from SCEV data.  */
9767       unsigned group_size = SLP_TREE_LANES (slp_node);
9768       tree *steps = XALLOCAVEC (tree, group_size);
9769       tree *inits = XALLOCAVEC (tree, group_size);
9770       stmt_vec_info phi_info;
9771       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9772         {
9773           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9774           if (!init_node)
9775             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9776                                            pe->dest_idx);
9777         }
9778
9779       /* Now generate the IVs.  */
9780       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9781       gcc_assert ((const_nunits * nvects) % group_size == 0);
9782       unsigned nivs;
9783       if (nested_in_vect_loop)
9784         nivs = nvects;
9785       else
9786         {
9787           /* Compute the number of distinct IVs we need.  First reduce
9788              group_size if it is a multiple of const_nunits so we get
9789              one IV for a group_size of 4 but const_nunits 2.  */
9790           unsigned group_sizep = group_size;
9791           if (group_sizep % const_nunits == 0)
9792             group_sizep = group_sizep / const_nunits;
9793           nivs = least_common_multiple (group_sizep,
9794                                         const_nunits) / const_nunits;
9795         }
9796       tree stept = TREE_TYPE (step_vectype);
9797       tree lupdate_mul = NULL_TREE;  /* Splat of the per-vector-iteration step multiplier.  */
9798       if (!nested_in_vect_loop)
9799         {
9800           /* The number of iterations covered in one vector iteration.  */
9801           unsigned lup_mul = (nvects * const_nunits) / group_size;
9802           lupdate_mul
9803             = build_vector_from_val (step_vectype,
9804                                      SCALAR_FLOAT_TYPE_P (stept)
9805                                      ? build_real_from_wide (stept, lup_mul,
9806                                                              UNSIGNED)
9807                                      : build_int_cstu (stept, lup_mul));
9808         }
9809       tree peel_mul = NULL_TREE;  /* Splat of the skipped iteration count in STEPT, if the loop mask "peels".  */
9810       gimple_seq init_stmts = NULL;
9811       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9812         {
9813           if (SCALAR_FLOAT_TYPE_P (stept))
9814             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9815                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9816           else
9817             peel_mul = gimple_convert (&init_stmts, stept,
9818                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9819           peel_mul = gimple_build_vector_from_val (&init_stmts,
9820                                                    step_vectype, peel_mul);
9821         }
9822       unsigned ivn;
9823       auto_vec<tree> vec_steps;
9824       for (ivn = 0; ivn < nivs; ++ivn)
9825         {
9826           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9827           tree_vector_builder init_elts (vectype, const_nunits, 1);
9828           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9829           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9830             {
9831               /* The scalar steps of the IVs.  */
9832               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9833               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9834               step_elts.quick_push (elt);
9835               if (!init_node)
9836                 {
9837                   /* The scalar inits of the IVs if not vectorized.  */
9838                   elt = inits[(ivn*const_nunits + eltn) % group_size];
9839                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
9840                                                   TREE_TYPE (elt)))
9841                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9842                                         TREE_TYPE (vectype), elt);
9843                   init_elts.quick_push (elt);
9844                 }
9845               /* The number of steps to add to the initial values.  */
9846               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9847               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9848                                    ? build_real_from_wide (stept,
9849                                                            mul_elt, UNSIGNED)
9850                                    : build_int_cstu (stept, mul_elt));
9851             }
9852           vec_step = gimple_build_vector (&init_stmts, &step_elts);
9853           vec_steps.safe_push (vec_step);
9854           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9855           if (peel_mul)  /* Masked peeling: start SKIP_NITERS steps early, as the non-SLP path does.  */
9856             step_mul = gimple_build (&init_stmts, MINUS_EXPR, step_vectype,
9857                                      step_mul, peel_mul);
9858           if (!init_node)
9859             vec_init = gimple_build_vector (&init_stmts, &init_elts);
9860
9861           /* Create the induction-phi that defines the induction-operand.  */
9862           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9863                                             "vec_iv_");
9864           induction_phi = create_phi_node (vec_dest, iv_loop->header);
9865           induc_def = PHI_RESULT (induction_phi);
9866
9867           /* Create the iv update inside the loop  */
9868           tree up = vec_step;
9869           if (lupdate_mul)
9870             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9871                                vec_step, lupdate_mul);
9872           gimple_seq stmts = NULL;
9873           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9874           vec_def = gimple_build (&stmts,
9875                                   PLUS_EXPR, step_vectype, vec_def, up);
9876           vec_def = gimple_convert (&stmts, vectype, vec_def);
9877           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9878           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9879                        UNKNOWN_LOCATION);
9880
9881           if (init_node)
9882             vec_init = vect_get_slp_vect_def (init_node, ivn);
9883           if (!nested_in_vect_loop
9884               && !integer_zerop (step_mul))
9885             {
9886               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9887               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9888                                  vec_step, step_mul);
9889               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9890                                       vec_def, up);
9891               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9892             }
9893
9894           /* Set the arguments of the phi node:  */
9895           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9896
9897           slp_node->push_vec_def (induction_phi);
9898         }
9899       if (!nested_in_vect_loop)
9900         {
9901           /* Fill up to the number of vectors we need for the whole group.  */
9902           nivs = least_common_multiple (group_size,
9903                                         const_nunits) / const_nunits;
9904           vec_steps.reserve (nivs-ivn);
9905           for (; ivn < nivs; ++ivn)
9906             {
9907               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9908               vec_steps.quick_push (vec_steps[0]);
9909             }
9910         }
9911
9912       /* Re-use IVs when we can.  We are generating further vector
9913          stmts by adding VF' * stride to the IVs generated above.  */
9914       if (ivn < nvects)
9915         {
9916           unsigned vfp
9917             = least_common_multiple (group_size, const_nunits) / group_size;
9918           tree lupdate_mul
9919             = build_vector_from_val (step_vectype,
9920                                      SCALAR_FLOAT_TYPE_P (stept)
9921                                      ? build_real_from_wide (stept,
9922                                                              vfp, UNSIGNED)
9923                                      : build_int_cstu (stept, vfp));
9924           for (; ivn < nvects; ++ivn)
9925             {
9926               gimple *iv
9927                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9928               tree def = gimple_get_lhs (iv);
9929               if (ivn < 2*nivs)
9930                 vec_steps[ivn - nivs]
9931                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9932                                   vec_steps[ivn - nivs], lupdate_mul);
9933               gimple_seq stmts = NULL;
9934               def = gimple_convert (&stmts, step_vectype, def);
9935               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9936                                   def, vec_steps[ivn % nivs]);
9937               def = gimple_convert (&stmts, vectype, def);
9938               if (gimple_code (iv) == GIMPLE_PHI)
9939                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9940               else
9941                 {
9942                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9943                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9944                 }
9945               slp_node->push_vec_def (def);
9946             }
9947         }
9948
9949       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9950       gcc_assert (!new_bb);
9951
9952       return true;
9953     }
9954
9955   init_expr = vect_phi_initial_value (phi);  /* Non-SLP path from here on.  */
9956
9957   gimple_seq stmts = NULL;
9958   if (!nested_in_vect_loop)
9959     {
9960       /* Convert the initial value to the IV update type.  */
9961       tree new_type = TREE_TYPE (step_expr);
9962       init_expr = gimple_convert (&stmts, new_type, init_expr);
9963
9964       /* If we are using the loop mask to "peel" for alignment then we need
9965          to adjust the start value here.  */
9966       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9967       if (skip_niters != NULL_TREE)
9968         {
9969           if (FLOAT_TYPE_P (vectype))
9970             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9971                                         skip_niters);
9972           else
9973             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9974           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9975                                          skip_niters, step_expr);
9976           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9977                                     init_expr, skip_step);
9978         }
9979     }
9980
9981   if (stmts)
9982     {
9983       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9984       gcc_assert (!new_bb);
9985     }
9986
9987   /* Create the vector that holds the initial_value of the induction.  */
9988   if (nested_in_vect_loop)
9989     {
9990       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
9991          been created during vectorization of previous stmts.  We obtain it
9992          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
9993       auto_vec<tree> vec_inits;
9994       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9995                                      init_expr, &vec_inits);
9996       vec_init = vec_inits[0];
9997       /* If the initial value is not of proper type, convert it.  */
9998       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9999         {
10000           new_stmt
10001             = gimple_build_assign (vect_get_new_ssa_name (vectype,
10002                                                           vect_simple_var,
10003                                                           "vec_iv_"),
10004                                    VIEW_CONVERT_EXPR,
10005                                    build1 (VIEW_CONVERT_EXPR, vectype,
10006                                            vec_init));
10007           vec_init = gimple_assign_lhs (new_stmt);
10008           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10009                                                  new_stmt);
10010           gcc_assert (!new_bb);
10011         }
10012     }
10013   else
10014     {
10015       /* iv_loop is the loop to be vectorized. Create:
10016          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10017       stmts = NULL;
10018       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10019
10020       unsigned HOST_WIDE_INT const_nunits;
10021       if (nunits.is_constant (&const_nunits))
10022         {
10023           tree_vector_builder elts (step_vectype, const_nunits, 1);
10024           elts.quick_push (new_name);
10025           for (i = 1; i < const_nunits; i++)
10026             {
10027               /* Create: new_name_i = new_name + step_expr  */
10028               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10029                                        new_name, step_expr);
10030               elts.quick_push (new_name);
10031             }
10032           /* Create a vector from [new_name_0, new_name_1, ...,
10033              new_name_nunits-1]  */
10034           vec_init = gimple_build_vector (&stmts, &elts);
10035         }
10036       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10037         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10038         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10039                                  new_name, step_expr);
10040       else
10041         {
10042           /* Build:
10043                 [base, base, base, ...]
10044                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10045           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10046           gcc_assert (flag_associative_math);
10047           tree index = build_index_vector (step_vectype, 0, 1);
10048           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10049                                                         new_name);
10050           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10051                                                         step_expr);
10052           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10053           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10054                                    vec_init, step_vec);
10055           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10056                                    vec_init, base_vec);
10057         }
10058       vec_init = gimple_convert (&stmts, vectype, vec_init);
10059
10060       if (stmts)
10061         {
10062           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10063           gcc_assert (!new_bb);
10064         }
10065     }
10066
10067
10068   /* Create the vector that holds the step of the induction.  */
10069   if (nested_in_vect_loop)
10070     /* iv_loop is nested in the loop to be vectorized. Generate:
10071        vec_step = [S, S, S, S]  */
10072     new_name = step_expr;
10073   else
10074     {
10075       /* iv_loop is the loop to be vectorized. Generate:
10076           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10077       gimple_seq seq = NULL;
10078       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10079         {
10080           expr = build_int_cst (integer_type_node, vf);
10081           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10082         }
10083       else
10084         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10085       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10086                                expr, step_expr);
10087       if (seq)
10088         {
10089           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10090           gcc_assert (!new_bb);
10091         }
10092     }
10093
10094   t = unshare_expr (new_name);
10095   gcc_assert (CONSTANT_CLASS_P (new_name)
10096               || TREE_CODE (new_name) == SSA_NAME);
10097   new_vec = build_vector_from_val (step_vectype, t);
10098   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10099                                new_vec, step_vectype, NULL);
10100
10101
10102   /* Create the following def-use cycle:
10103      loop prolog:
10104          vec_init = ...
10105          vec_step = ...
10106      loop:
10107          vec_iv = PHI <vec_init, vec_loop>
10108          ...
10109          STMT
10110          ...
10111          vec_loop = vec_iv + vec_step;  */
10112
10113   /* Create the induction-phi that defines the induction-operand.  */
10114   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10115   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10116   induc_def = PHI_RESULT (induction_phi);
10117
10118   /* Create the iv update inside the loop; the addition is done in STEP_VECTYPE.  */
10119   stmts = NULL;
10120   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10121   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10122   vec_def = gimple_convert (&stmts, vectype, vec_def);
10123   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10124   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10125
10126   /* Set the arguments of the phi node:  */
10127   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10128   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10129                UNKNOWN_LOCATION);
10130
10131   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10132   *vec_stmt = induction_phi;
10133
10134   /* In case that vectorization factor (VF) is bigger than the number
10135      of elements that we can fit in a vectype (nunits), we have to generate
10136      more than one vector stmt - i.e - we need to "unroll" the
10137      vector stmt by a factor VF/nunits.  For more details see documentation
10138      in vectorizable_operation.  */
10139
10140   if (ncopies > 1)
10141     {
10142       gimple_seq seq = NULL;
10143       /* FORNOW. This restriction should be relaxed.  */
10144       gcc_assert (!nested_in_vect_loop);
10145
10146       /* Create the vector that holds the step of the induction (NUNITS * S per copy).  */
10147       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10148         {
10149           expr = build_int_cst (integer_type_node, nunits);
10150           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10151         }
10152       else
10153         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10154       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10155                                expr, step_expr);
10156       if (seq)
10157         {
10158           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10159           gcc_assert (!new_bb);
10160         }
10161
10162       t = unshare_expr (new_name);
10163       gcc_assert (CONSTANT_CLASS_P (new_name)
10164                   || TREE_CODE (new_name) == SSA_NAME);
10165       new_vec = build_vector_from_val (step_vectype, t);
10166       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10167                                    new_vec, step_vectype, NULL);
10168
10169       vec_def = induc_def;
10170       for (i = 1; i < ncopies + 1; i++)  /* The extra last iteration builds the latch value.  */
10171         {
10172           /* vec_i = vec_prev + vec_step  */
10173           gimple_seq stmts = NULL;
10174           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10175           vec_def = gimple_build (&stmts,
10176                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10177           vec_def = gimple_convert (&stmts, vectype, vec_def);
10178
10179           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10180           if (i < ncopies)
10181             {
10182               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10183               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10184             }
10185           else
10186             {
10187               /* vec_1 = vec_iv + (VF/n * S)
10188                  vec_2 = vec_1 + (VF/n * S)
10189                  ...
10190                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10191
10192                  vec_n is used as vec_loop to save the large step register and
10193                  related operations.  NOTE(review): a latch argument was already added above; presumably this call replaces it -- confirm.  */
10194               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10195                            UNKNOWN_LOCATION);
10196             }
10197         }
10198     }
10199
10200   if (dump_enabled_p ())
10201     dump_printf_loc (MSG_NOTE, vect_location,
10202                      "transform induction: created def-use cycle: %G%G",
10203                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10204
10205   return true;
10206 }
10207
10208 /* Function vectorizable_live_operation.
10209
10210    STMT_INFO computes a value that is used outside the loop.  Check if
10211    it can be supported.  */
10212
10213 bool
10214 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10215                              slp_tree slp_node, slp_instance slp_node_instance,
10216                              int slp_index, bool vec_stmt_p,
10217                              stmt_vector_for_cost *cost_vec)
10218 {
10219   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10220   imm_use_iterator imm_iter;
10221   tree lhs, lhs_type, bitsize;
10222   tree vectype = (slp_node
10223                   ? SLP_TREE_VECTYPE (slp_node)
10224                   : STMT_VINFO_VECTYPE (stmt_info));
10225   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10226   int ncopies;
10227   gimple *use_stmt;
10228   auto_vec<tree> vec_oprnds;
10229   int vec_entry = 0;
10230   poly_uint64 vec_index = 0;
10231
10232   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10233
10234   /* If a stmt of a reduction is live, vectorize it via
10235      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10236      validity so just trigger the transform here.  */
10237   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10238     {
10239       if (!vec_stmt_p)
10240         return true;
10241       if (slp_node)
10242         {
10243           /* For reduction chains the meta-info is attached to
10244              the group leader.  */
10245           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10246             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10247           /* For SLP reductions we vectorize the epilogue for
10248              all involved stmts together.  */
10249           else if (slp_index != 0)
10250             return true;
10251         }
10252       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10253       gcc_assert (reduc_info->is_reduc_info);
10254       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10255           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10256         return true;
10257       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10258                                         slp_node_instance);
10259       return true;
10260     }
10261
10262   /* If STMT is not relevant and it is a simple assignment and its inputs are
10263      invariant then it can remain in place, unvectorized.  The original last
10264      scalar value that it computes will be used.  */
10265   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10266     {
10267       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10268       if (dump_enabled_p ())
10269         dump_printf_loc (MSG_NOTE, vect_location,
10270                          "statement is simple and uses invariant.  Leaving in "
10271                          "place.\n");
10272       return true;
10273     }
10274
10275   if (slp_node)
10276     ncopies = 1;
10277   else
10278     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10279
10280   if (slp_node)
10281     {
10282       gcc_assert (slp_index >= 0);
10283
10284       /* Get the last occurrence of the scalar index from the concatenation of
10285          all the slp vectors. Calculate which slp vector it is and the index
10286          within.  */
10287       int num_scalar = SLP_TREE_LANES (slp_node);
10288       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10289       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10290
10291       /* Calculate which vector contains the result, and which lane of
10292          that vector we need.  */
10293       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10294         {
10295           if (dump_enabled_p ())
10296             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10297                              "Cannot determine which vector holds the"
10298                              " final result.\n");
10299           return false;
10300         }
10301     }
10302
10303   if (!vec_stmt_p)
10304     {
10305       /* No transformation required.  */
10306       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10307         {
10308           if (slp_node)
10309             {
10310               if (dump_enabled_p ())
10311                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10312                                  "can't operate on partial vectors "
10313                                  "because an SLP statement is live after "
10314                                  "the loop.\n");
10315               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10316             }
10317           else if (ncopies > 1)
10318             {
10319               if (dump_enabled_p ())
10320                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10321                                  "can't operate on partial vectors "
10322                                  "because ncopies is greater than 1.\n");
10323               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10324             }
10325           else
10326             {
10327               gcc_assert (ncopies == 1 && !slp_node);
10328               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10329                                                   OPTIMIZE_FOR_SPEED))
10330                 vect_record_loop_mask (loop_vinfo,
10331                                        &LOOP_VINFO_MASKS (loop_vinfo),
10332                                        1, vectype, NULL);
10333               else if (can_vec_extract_var_idx_p (
10334                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10335                 vect_record_loop_len (loop_vinfo,
10336                                       &LOOP_VINFO_LENS (loop_vinfo),
10337                                       1, vectype, 1);
10338               else
10339                 {
10340                   if (dump_enabled_p ())
10341                     dump_printf_loc (
10342                       MSG_MISSED_OPTIMIZATION, vect_location,
10343                       "can't operate on partial vectors "
10344                       "because the target doesn't support extract "
10345                       "last reduction.\n");
10346                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10347                 }
10348             }
10349         }
10350       /* ???  Enable for loop costing as well.  */
10351       if (!loop_vinfo)
10352         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10353                           0, vect_epilogue);
10354       return true;
10355     }
10356
10357   /* Use the lhs of the original scalar statement.  */
10358   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10359   if (dump_enabled_p ())
10360     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10361                      "stmt %G", stmt);
10362
10363   lhs = gimple_get_lhs (stmt);
10364   lhs_type = TREE_TYPE (lhs);
10365
10366   bitsize = vector_element_bits_tree (vectype);
10367
10368   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10369   tree vec_lhs, bitstart;
10370   gimple *vec_stmt;
10371   if (slp_node)
10372     {
10373       gcc_assert (!loop_vinfo
10374                   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10375                       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10376
10377       /* Get the correct slp vectorized stmt.  */
10378       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10379       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10380
10381       /* Get entry to use.  */
10382       bitstart = bitsize_int (vec_index);
10383       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10384     }
10385   else
10386     {
10387       /* For multiple copies, get the last copy.  */
10388       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10389       vec_lhs = gimple_get_lhs (vec_stmt);
10390
10391       /* Get the last lane in the vector.  */
10392       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10393     }
10394
10395   if (loop_vinfo)
10396     {
10397       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10398          requirement, insert one phi node for it.  It looks like:
10399            loop;
10400          BB:
10401            # lhs' = PHI <lhs>
10402          ==>
10403            loop;
10404          BB:
10405            # vec_lhs' = PHI <vec_lhs>
10406            new_tree = lane_extract <vec_lhs', ...>;
10407            lhs' = new_tree;  */
10408
10409       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10410       basic_block exit_bb = single_exit (loop)->dest;
10411       gcc_assert (single_pred_p (exit_bb));
10412
10413       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10414       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10415       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10416
10417       gimple_seq stmts = NULL;
10418       tree new_tree;
10419       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10420         {
10421           /* Emit:
10422
10423                SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10424
10425              where VEC_LHS is the vectorized live-out result and MASK is
10426              the loop mask for the final iteration.  */
10427           gcc_assert (ncopies == 1 && !slp_node);
10428           gimple_seq tem = NULL;
10429           gimple_stmt_iterator gsi = gsi_last (tem);
10430           tree len
10431             = vect_get_loop_len (loop_vinfo, &gsi,
10432                                  &LOOP_VINFO_LENS (loop_vinfo),
10433                                  1, vectype, 0, 0);
10434
10435           /* BIAS - 1.  */
10436           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10437           tree bias_minus_one
10438             = int_const_binop (MINUS_EXPR,
10439                                build_int_cst (TREE_TYPE (len), biasval),
10440                                build_one_cst (TREE_TYPE (len)));
10441
10442           /* LAST_INDEX = LEN + (BIAS - 1).  */
10443           tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10444                                           len, bias_minus_one);
10445
10446           /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
10447           tree scalar_res
10448             = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10449                             vec_lhs_phi, last_index);
10450
10451           /* Convert the extracted vector element to the scalar type.  */
10452           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10453         }
10454       else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10455         {
10456           /* Emit:
10457
10458                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10459
10460              where VEC_LHS is the vectorized live-out result and MASK is
10461              the loop mask for the final iteration.  */
10462           gcc_assert (ncopies == 1 && !slp_node);
10463           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10464           gimple_seq tem = NULL;
10465           gimple_stmt_iterator gsi = gsi_last (tem);
10466           tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10467                                           &LOOP_VINFO_MASKS (loop_vinfo),
10468                                           1, vectype, 0);
10469           gimple_seq_add_seq (&stmts, tem);
10470           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10471                                           mask, vec_lhs_phi);
10472
10473           /* Convert the extracted vector element to the scalar type.  */
10474           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10475         }
10476       else
10477         {
10478           tree bftype = TREE_TYPE (vectype);
10479           if (VECTOR_BOOLEAN_TYPE_P (vectype))
10480             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10481           new_tree = build3 (BIT_FIELD_REF, bftype,
10482                              vec_lhs_phi, bitsize, bitstart);
10483           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10484                                            &stmts, true, NULL_TREE);
10485         }
10486
10487       if (stmts)
10488         {
10489           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10490           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10491
10492           /* Remove existing phi from lhs and create one copy from new_tree.  */
10493           tree lhs_phi = NULL_TREE;
10494           gimple_stmt_iterator gsi;
10495           for (gsi = gsi_start_phis (exit_bb);
10496                !gsi_end_p (gsi); gsi_next (&gsi))
10497             {
10498               gimple *phi = gsi_stmt (gsi);
10499               if ((gimple_phi_arg_def (phi, 0) == lhs))
10500                 {
10501                   remove_phi_node (&gsi, false);
10502                   lhs_phi = gimple_phi_result (phi);
10503                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10504                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10505                   break;
10506                 }
10507             }
10508         }
10509
10510       /* Replace use of lhs with newly computed result.  If the use stmt is a
10511          single arg PHI, just replace all uses of PHI result.  It's necessary
10512          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10513       use_operand_p use_p;
10514       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10515         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10516             && !is_gimple_debug (use_stmt))
10517           {
10518             if (gimple_code (use_stmt) == GIMPLE_PHI
10519                 && gimple_phi_num_args (use_stmt) == 1)
10520               {
10521                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10522               }
10523             else
10524               {
10525                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10526                     SET_USE (use_p, new_tree);
10527               }
10528             update_stmt (use_stmt);
10529           }
10530     }
10531   else
10532     {
10533       /* For basic-block vectorization simply insert the lane-extraction.  */
10534       tree bftype = TREE_TYPE (vectype);
10535       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10536         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10537       tree new_tree = build3 (BIT_FIELD_REF, bftype,
10538                               vec_lhs, bitsize, bitstart);
10539       gimple_seq stmts = NULL;
10540       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10541                                        &stmts, true, NULL_TREE);
10542       if (TREE_CODE (new_tree) == SSA_NAME
10543           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10544         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10545       if (is_a <gphi *> (vec_stmt))
10546         {
10547           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10548           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10549         }
10550       else
10551         {
10552           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10553           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10554         }
10555
10556       /* Replace use of lhs with newly computed result.  If the use stmt is a
10557          single arg PHI, just replace all uses of PHI result.  It's necessary
10558          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10559       use_operand_p use_p;
10560       stmt_vec_info use_stmt_info;
10561       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10562         if (!is_gimple_debug (use_stmt)
10563             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10564                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10565           {
10566             /* ???  This can happen when the live lane ends up being
10567                used in a vector construction code-generated by an
10568                external SLP node (and code-generation for that already
10569                happened).  See gcc.dg/vect/bb-slp-47.c.
10570                Doing this is what would happen if that vector CTOR
10571                were not code-generated yet so it is not too bad.
10572                ???  In fact we'd likely want to avoid this situation
10573                in the first place.  */
10574             if (TREE_CODE (new_tree) == SSA_NAME
10575                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10576                 && gimple_code (use_stmt) != GIMPLE_PHI
10577                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10578                                                 use_stmt))
10579               {
10580                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10581                 gcc_checking_assert (code == SSA_NAME
10582                                      || code == CONSTRUCTOR
10583                                      || code == VIEW_CONVERT_EXPR
10584                                      || CONVERT_EXPR_CODE_P (code));
10585                 if (dump_enabled_p ())
10586                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10587                                    "Using original scalar computation for "
10588                                    "live lane because use preceeds vector "
10589                                    "def\n");
10590                 continue;
10591               }
10592             /* ???  It can also happen that we end up pulling a def into
10593                a loop where replacing out-of-loop uses would require
10594                a new LC SSA PHI node.  Retain the original scalar in
10595                those cases as well.  PR98064.  */
10596             if (TREE_CODE (new_tree) == SSA_NAME
10597                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10598                 && (gimple_bb (use_stmt)->loop_father
10599                     != gimple_bb (vec_stmt)->loop_father)
10600                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10601                                         gimple_bb (use_stmt)->loop_father))
10602               {
10603                 if (dump_enabled_p ())
10604                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10605                                    "Using original scalar computation for "
10606                                    "live lane because there is an out-of-loop "
10607                                    "definition for it\n");
10608                 continue;
10609               }
10610             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10611               SET_USE (use_p, new_tree);
10612             update_stmt (use_stmt);
10613           }
10614     }
10615
10616   return true;
10617 }
10618
10619 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
10620
10621 static void
10622 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10623 {
10624   ssa_op_iter op_iter;
10625   imm_use_iterator imm_iter;
10626   def_operand_p def_p;
10627   gimple *ustmt;
10628
10629   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10630     {
10631       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10632         {
10633           basic_block bb;
10634
10635           if (!is_gimple_debug (ustmt))
10636             continue;
10637
10638           bb = gimple_bb (ustmt);
10639
10640           if (!flow_bb_inside_loop_p (loop, bb))
10641             {
10642               if (gimple_debug_bind_p (ustmt))
10643                 {
10644                   if (dump_enabled_p ())
10645                     dump_printf_loc (MSG_NOTE, vect_location,
10646                                      "killing debug use\n");
10647
10648                   gimple_debug_bind_reset_value (ustmt);
10649                   update_stmt (ustmt);
10650                 }
10651               else
10652                 gcc_unreachable ();
10653             }
10654         }
10655     }
10656 }
10657
10658 /* Given loop represented by LOOP_VINFO, return true if computation of
10659    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10660    otherwise.  */
10661
10662 static bool
10663 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10664 {
10665   /* Constant case.  */
10666   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10667     {
10668       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10669       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10670
10671       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10672       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10673       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10674         return true;
10675     }
10676
10677   widest_int max;
10678   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10679   /* Check the upper bound of loop niters.  */
10680   if (get_max_loop_iterations (loop, &max))
10681     {
10682       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10683       signop sgn = TYPE_SIGN (type);
10684       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10685       if (max < type_max)
10686         return true;
10687     }
10688   return false;
10689 }
10690
10691 /* Return a mask type with half the number of elements as OLD_TYPE,
10692    given that it should have mode NEW_MODE.  */
10693
10694 tree
10695 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10696 {
10697   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10698   return build_truth_vector_type_for_mode (nunits, new_mode);
10699 }
10700
10701 /* Return a mask type with twice as many elements as OLD_TYPE,
10702    given that it should have mode NEW_MODE.  */
10703
10704 tree
10705 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10706 {
10707   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10708   return build_truth_vector_type_for_mode (nunits, new_mode);
10709 }
10710
10711 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10712    contain a sequence of NVECTORS masks that each control a vector of type
10713    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
10714    these vector masks with the vector version of SCALAR_MASK.  */
10715
10716 void
10717 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10718                        unsigned int nvectors, tree vectype, tree scalar_mask)
10719 {
10720   gcc_assert (nvectors != 0);
10721
10722   if (scalar_mask)
10723     {
10724       scalar_cond_masked_key cond (scalar_mask, nvectors);
10725       loop_vinfo->scalar_cond_masked_set.add (cond);
10726     }
10727
10728   masks->mask_set.add (std::make_pair (vectype, nvectors));
10729 }
10730
10731 /* Given a complete set of masks MASKS, extract mask number INDEX
10732    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10733    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
10734
10735    See the comment above vec_loop_masks for more details about the mask
10736    arrangement.  */
10737
10738 tree
10739 vect_get_loop_mask (loop_vec_info loop_vinfo,
10740                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10741                     unsigned int nvectors, tree vectype, unsigned int index)
10742 {
10743   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10744       == vect_partial_vectors_while_ult)
10745     {
10746       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10747       tree mask_type = rgm->type;
10748
10749       /* Populate the rgroup's mask array, if this is the first time we've
10750          used it.  */
10751       if (rgm->controls.is_empty ())
10752         {
10753           rgm->controls.safe_grow_cleared (nvectors, true);
10754           for (unsigned int i = 0; i < nvectors; ++i)
10755             {
10756               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10757               /* Provide a dummy definition until the real one is available.  */
10758               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10759               rgm->controls[i] = mask;
10760             }
10761         }
10762
10763       tree mask = rgm->controls[index];
10764       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10765                     TYPE_VECTOR_SUBPARTS (vectype)))
10766         {
10767           /* A loop mask for data type X can be reused for data type Y
10768              if X has N times more elements than Y and if Y's elements
10769              are N times bigger than X's.  In this case each sequence
10770              of N elements in the loop mask will be all-zero or all-one.
10771              We can then view-convert the mask so that each sequence of
10772              N elements is replaced by a single element.  */
10773           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10774                                   TYPE_VECTOR_SUBPARTS (vectype)));
10775           gimple_seq seq = NULL;
10776           mask_type = truth_type_for (vectype);
10777           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10778           if (seq)
10779             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10780         }
10781       return mask;
10782     }
10783   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10784            == vect_partial_vectors_avx512)
10785     {
10786       /* The number of scalars per iteration and the number of vectors are
10787          both compile-time constants.  */
10788       unsigned int nscalars_per_iter
10789         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10790                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10791
10792       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10793
10794       /* The stored nV is dependent on the mask type produced.  */
10795       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10796                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10797                   == rgm->factor);
10798       nvectors = rgm->factor;
10799
10800       /* Populate the rgroup's mask array, if this is the first time we've
10801          used it.  */
10802       if (rgm->controls.is_empty ())
10803         {
10804           rgm->controls.safe_grow_cleared (nvectors, true);
10805           for (unsigned int i = 0; i < nvectors; ++i)
10806             {
10807               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10808               /* Provide a dummy definition until the real one is available.  */
10809               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10810               rgm->controls[i] = mask;
10811             }
10812         }
10813       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10814                     TYPE_VECTOR_SUBPARTS (vectype)))
10815         return rgm->controls[index];
10816
10817       /* Split the vector if needed.  Since we are dealing with integer mode
10818          masks with AVX512 we can operate on the integer representation
10819          performing the whole vector shifting.  */
10820       unsigned HOST_WIDE_INT factor;
10821       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10822                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
10823       gcc_assert (ok);
10824       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10825       tree mask_type = truth_type_for (vectype);
10826       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10827       unsigned vi = index / factor;
10828       unsigned vpart = index % factor;
10829       tree vec = rgm->controls[vi];
10830       gimple_seq seq = NULL;
10831       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10832                           lang_hooks.types.type_for_mode
10833                                 (TYPE_MODE (rgm->type), 1), vec);
10834       /* For integer mode masks simply shift the right bits into position.  */
10835       if (vpart != 0)
10836         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10837                             build_int_cst (integer_type_node,
10838                                            (TYPE_VECTOR_SUBPARTS (vectype)
10839                                             * vpart)));
10840       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10841                                     (TYPE_MODE (mask_type), 1), vec);
10842       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10843       if (seq)
10844         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10845       return vec;
10846     }
10847   else
10848     gcc_unreachable ();
10849 }
10850
10851 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10852    lengths for controlling an operation on VECTYPE.  The operation splits
10853    each element of VECTYPE into FACTOR separate subelements, measuring the
10854    length as a number of these subelements.  */
10855
10856 void
10857 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10858                       unsigned int nvectors, tree vectype, unsigned int factor)
10859 {
10860   gcc_assert (nvectors != 0);
10861   if (lens->length () < nvectors)
10862     lens->safe_grow_cleared (nvectors, true);
10863   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10864
10865   /* The number of scalars per iteration, scalar occupied bytes and
10866      the number of vectors are both compile-time constants.  */
10867   unsigned int nscalars_per_iter
10868     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10869                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10870
10871   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10872     {
10873       /* For now, we only support cases in which all loads and stores fall back
10874          to VnQI or none do.  */
10875       gcc_assert (!rgl->max_nscalars_per_iter
10876                   || (rgl->factor == 1 && factor == 1)
10877                   || (rgl->max_nscalars_per_iter * rgl->factor
10878                       == nscalars_per_iter * factor));
10879       rgl->max_nscalars_per_iter = nscalars_per_iter;
10880       rgl->type = vectype;
10881       rgl->factor = factor;
10882     }
10883 }
10884
10885 /* Given a complete set of lengths LENS, extract length number INDEX
10886    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10887    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
10888    multipled by the number of elements that should be processed.
10889    Insert any set-up statements before GSI.  */
10890
10891 tree
10892 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10893                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10894                    unsigned int index, unsigned int factor)
10895 {
10896   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10897   bool use_bias_adjusted_len =
10898     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10899
10900   /* Populate the rgroup's len array, if this is the first time we've
10901      used it.  */
10902   if (rgl->controls.is_empty ())
10903     {
10904       rgl->controls.safe_grow_cleared (nvectors, true);
10905       for (unsigned int i = 0; i < nvectors; ++i)
10906         {
10907           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10908           gcc_assert (len_type != NULL_TREE);
10909
10910           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10911
10912           /* Provide a dummy definition until the real one is available.  */
10913           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10914           rgl->controls[i] = len;
10915
10916           if (use_bias_adjusted_len)
10917             {
10918               gcc_assert (i == 0);
10919               tree adjusted_len =
10920                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10921               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10922               rgl->bias_adjusted_ctrl = adjusted_len;
10923             }
10924         }
10925     }
10926
10927   if (use_bias_adjusted_len)
10928     return rgl->bias_adjusted_ctrl;
10929
10930   tree loop_len = rgl->controls[index];
10931   if (rgl->factor == 1 && factor == 1)
10932     {
10933       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10934       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10935       if (maybe_ne (nunits1, nunits2))
10936         {
10937           /* A loop len for data type X can be reused for data type Y
10938              if X has N times more elements than Y and if Y's elements
10939              are N times bigger than X's.  */
10940           gcc_assert (multiple_p (nunits1, nunits2));
10941           factor = exact_div (nunits1, nunits2).to_constant ();
10942           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10943           gimple_seq seq = NULL;
10944           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10945                                    build_int_cst (iv_type, factor));
10946           if (seq)
10947             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10948         }
10949     }
10950   return loop_len;
10951 }
10952
10953 /* Scale profiling counters by estimation for LOOP which is vectorized
10954    by factor VF.
10955    If FLAT is true, the loop we started with had unrealistically flat
10956    profile.  */
10957
10958 static void
10959 scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
10960 {
10961   /* For flat profiles do not scale down proportionally by VF and only
10962      cap by known iteration count bounds.  */
10963   if (flat)
10964     {
10965       if (dump_file && (dump_flags & TDF_DETAILS))
10966         fprintf (dump_file,
10967                  "Vectorized loop profile seems flat; not scaling iteration "
10968                  "count down by the vectorization factor %i\n", vf);
10969       scale_loop_profile (loop, profile_probability::always (),
10970                           get_likely_max_loop_iterations_int (loop));
10971       return;
10972     }
10973   /* Loop body executes VF fewer times and exit increases VF times.  */
10974   edge exit_e = single_exit (loop);
10975   profile_count entry_count = loop_preheader_edge (loop)->count ();
10976
10977   /* If we have unreliable loop profile avoid dropping entry
10978      count bellow header count.  This can happen since loops
10979      has unrealistically low trip counts.  */
10980   while (vf > 1
10981          && loop->header->count > entry_count
10982          && loop->header->count < entry_count * vf)
10983     {
10984       if (dump_file && (dump_flags & TDF_DETAILS))
10985         fprintf (dump_file,
10986                  "Vectorization factor %i seems too large for profile "
10987                  "prevoiusly believed to be consistent; reducing.\n", vf);
10988       vf /= 2;
10989     }
10990
10991   if (entry_count.nonzero_p ())
10992     set_edge_probability_and_rescale_others
10993             (exit_e,
10994              entry_count.probability_in (loop->header->count / vf));
10995   /* Avoid producing very large exit probability when we do not have
10996      sensible profile.  */
10997   else if (exit_e->probability < profile_probability::always () / (vf * 2))
10998     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10999   loop->latch->count = single_pred_edge (loop->latch)->count ();
11000
11001   scale_loop_profile (loop, profile_probability::always () / vf,
11002                       get_likely_max_loop_iterations_int (loop));
11003 }
11004
11005 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11006    latch edge values originally defined by it.  */
11007
11008 static void
11009 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11010                                      stmt_vec_info def_stmt_info)
11011 {
11012   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11013   if (!def || TREE_CODE (def) != SSA_NAME)
11014     return;
11015   stmt_vec_info phi_info;
11016   imm_use_iterator iter;
11017   use_operand_p use_p;
11018   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11019     {
11020       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11021       if (!phi)
11022         continue;
11023       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11024             && (phi_info = loop_vinfo->lookup_stmt (phi))
11025             && STMT_VINFO_RELEVANT_P (phi_info)))
11026         continue;
11027       loop_p loop = gimple_bb (phi)->loop_father;
11028       edge e = loop_latch_edge (loop);
11029       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11030         continue;
11031
11032       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11033           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11034           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11035         {
11036           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11037           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11038           gcc_assert (phi_defs.length () == latch_defs.length ());
11039           for (unsigned i = 0; i < phi_defs.length (); ++i)
11040             add_phi_arg (as_a <gphi *> (phi_defs[i]),
11041                          gimple_get_lhs (latch_defs[i]), e,
11042                          gimple_phi_arg_location (phi, e->dest_idx));
11043         }
11044       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11045         {
11046           /* For first order recurrences we have to update both uses of
11047              the latch definition, the one in the PHI node and the one
11048              in the generated VEC_PERM_EXPR.  */
11049           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11050           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11051           gcc_assert (phi_defs.length () == latch_defs.length ());
11052           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11053           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11054           for (unsigned i = 0; i < phi_defs.length (); ++i)
11055             {
11056               gassign *perm = as_a <gassign *> (phi_defs[i]);
11057               if (i > 0)
11058                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11059               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11060               update_stmt (perm);
11061             }
11062           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11063                        gimple_phi_arg_location (phi, e->dest_idx));
11064         }
11065     }
11066 }
11067
11068 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11069    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11070    stmt_vec_info.  */
11071
11072 static bool
11073 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11074                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11075 {
11076   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11077   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11078
11079   if (dump_enabled_p ())
11080     dump_printf_loc (MSG_NOTE, vect_location,
11081                      "------>vectorizing statement: %G", stmt_info->stmt);
11082
11083   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11084     vect_loop_kill_debug_uses (loop, stmt_info);
11085
11086   if (!STMT_VINFO_RELEVANT_P (stmt_info)
11087       && !STMT_VINFO_LIVE_P (stmt_info))
11088     return false;
11089
11090   if (STMT_VINFO_VECTYPE (stmt_info))
11091     {
11092       poly_uint64 nunits
11093         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11094       if (!STMT_SLP_TYPE (stmt_info)
11095           && maybe_ne (nunits, vf)
11096           && dump_enabled_p ())
11097         /* For SLP VF is set according to unrolling factor, and not
11098            to vector size, hence for SLP this print is not valid.  */
11099         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11100     }
11101
11102   /* Pure SLP statements have already been vectorized.  We still need
11103      to apply loop vectorization to hybrid SLP statements.  */
11104   if (PURE_SLP_STMT (stmt_info))
11105     return false;
11106
11107   if (dump_enabled_p ())
11108     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11109
11110   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11111     *seen_store = stmt_info;
11112
11113   return true;
11114 }
11115
11116 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11117    in the hash_map with its corresponding values.  */
11118
11119 static tree
11120 find_in_mapping (tree t, void *context)
11121 {
11122   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11123
11124   tree *value = mapping->get (t);
11125   return value ? *value : t;
11126 }
11127
11128 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11129    original loop that has now been vectorized.
11130
11131    The inits of the data_references need to be advanced with the number of
11132    iterations of the main loop.  This has been computed in vect_do_peeling and
11133    is stored in parameter ADVANCE.  We first restore the data_references
11134    initial offset with the values recored in ORIG_DRS_INIT.
11135
11136    Since the loop_vec_info of this EPILOGUE was constructed for the original
11137    loop, its stmt_vec_infos all point to the original statements.  These need
11138    to be updated to point to their corresponding copies as well as the SSA_NAMES
11139    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11140
11141    The data_reference's connections also need to be updated.  Their
11142    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11143    stmt_vec_infos, their statements need to point to their corresponding copy,
11144    if they are gather loads or scatter stores then their reference needs to be
11145    updated to point to its corresponding copy and finally we set
11146    'base_misaligned' to false as we have already peeled for alignment in the
11147    prologue of the main loop.  */
11148
11149 static void
11150 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11151 {
11152   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11153   auto_vec<gimple *> stmt_worklist;
11154   hash_map<tree,tree> mapping;
11155   gimple *orig_stmt, *new_stmt;
11156   gimple_stmt_iterator epilogue_gsi;
11157   gphi_iterator epilogue_phi_gsi;
11158   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11159   basic_block *epilogue_bbs = get_loop_body (epilogue);
11160   unsigned i;
11161
11162   free (LOOP_VINFO_BBS (epilogue_vinfo));
11163   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11164
11165   /* Advance data_reference's with the number of iterations of the previous
11166      loop and its prologue.  */
11167   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11168
11169
11170   /* The EPILOGUE loop is a copy of the original loop so they share the same
11171      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11172      point to the copied statements.  We also create a mapping of all LHS' in
11173      the original loop and all the LHS' in the EPILOGUE and create worklists to
11174      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11175   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11176     {
11177       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11178            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11179         {
11180           new_stmt = epilogue_phi_gsi.phi ();
11181
11182           gcc_assert (gimple_uid (new_stmt) > 0);
11183           stmt_vinfo
11184             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11185
11186           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11187           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11188
11189           mapping.put (gimple_phi_result (orig_stmt),
11190                        gimple_phi_result (new_stmt));
11191           /* PHI nodes can not have patterns or related statements.  */
11192           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11193                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11194         }
11195
11196       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11197            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11198         {
11199           new_stmt = gsi_stmt (epilogue_gsi);
11200           if (is_gimple_debug (new_stmt))
11201             continue;
11202
11203           gcc_assert (gimple_uid (new_stmt) > 0);
11204           stmt_vinfo
11205             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11206
11207           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11208           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11209
11210           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11211             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11212
11213           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11214             {
11215               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11216               for (gimple_stmt_iterator gsi = gsi_start (seq);
11217                    !gsi_end_p (gsi); gsi_next (&gsi))
11218                 stmt_worklist.safe_push (gsi_stmt (gsi));
11219             }
11220
11221           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11222           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11223             {
11224               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11225               stmt_worklist.safe_push (stmt);
11226               /* Set BB such that the assert in
11227                 'get_initial_def_for_reduction' is able to determine that
11228                 the BB of the related stmt is inside this loop.  */
11229               gimple_set_bb (stmt,
11230                              gimple_bb (new_stmt));
11231               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11232               gcc_assert (related_vinfo == NULL
11233                           || related_vinfo == stmt_vinfo);
11234             }
11235         }
11236     }
11237
11238   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11239      using the original main loop and thus need to be updated to refer to the
11240      cloned variables used in the epilogue.  */
11241   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11242     {
11243       gimple *stmt = stmt_worklist[i];
11244       tree *new_op;
11245
11246       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11247         {
11248           tree op = gimple_op (stmt, j);
11249           if ((new_op = mapping.get(op)))
11250             gimple_set_op (stmt, j, *new_op);
11251           else
11252             {
11253               /* PR92429: The last argument of simplify_replace_tree disables
11254                  folding when replacing arguments.  This is required as
11255                  otherwise you might end up with different statements than the
11256                  ones analyzed in vect_loop_analyze, leading to different
11257                  vectorization.  */
11258               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11259                                           &find_in_mapping, &mapping, false);
11260               gimple_set_op (stmt, j, op);
11261             }
11262         }
11263     }
11264
11265   struct data_reference *dr;
11266   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11267   FOR_EACH_VEC_ELT (datarefs, i, dr)
11268     {
11269       orig_stmt = DR_STMT (dr);
11270       gcc_assert (gimple_uid (orig_stmt) > 0);
11271       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11272       /* Data references for gather loads and scatter stores do not use the
11273          updated offset we set using ADVANCE.  Instead we have to make sure the
11274          reference in the data references point to the corresponding copy of
11275          the original in the epilogue.  */
11276       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11277           == VMAT_GATHER_SCATTER)
11278         {
11279           DR_REF (dr)
11280             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11281                                      &find_in_mapping, &mapping);
11282           DR_BASE_ADDRESS (dr)
11283             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11284                                      &find_in_mapping, &mapping);
11285         }
11286       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11287       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11288       /* The vector size of the epilogue is smaller than that of the main loop
11289          so the alignment is either the same or lower. This means the dr will
11290          thus by definition be aligned.  */
11291       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11292     }
11293
11294   epilogue_vinfo->shared->datarefs_copy.release ();
11295   epilogue_vinfo->shared->save_datarefs ();
11296 }
11297
11298 /* Function vect_transform_loop.
11299
11300    The analysis phase has determined that the loop is vectorizable.
11301    Vectorize the loop - created vectorized stmts to replace the scalar
11302    stmts in the loop, and update the loop exit condition.
11303    Returns scalar epilogue loop if any.  */
11304
11305 class loop *
11306 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11307 {
11308   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11309   class loop *epilogue = NULL;
11310   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11311   int nbbs = loop->num_nodes;
11312   int i;
11313   tree niters_vector = NULL_TREE;
11314   tree step_vector = NULL_TREE;
11315   tree niters_vector_mult_vf = NULL_TREE;
11316   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11317   unsigned int lowest_vf = constant_lower_bound (vf);
11318   gimple *stmt;
11319   bool check_profitability = false;
11320   unsigned int th;
11321   bool flat = maybe_flat_loop_profile (loop);
11322
11323   DUMP_VECT_SCOPE ("vec_transform_loop");
11324
11325   loop_vinfo->shared->check_datarefs ();
11326
11327   /* Use the more conservative vectorization threshold.  If the number
11328      of iterations is constant assume the cost check has been performed
11329      by our caller.  If the threshold makes all loops profitable that
11330      run at least the (estimated) vectorization factor number of times
11331      checking is pointless, too.  */
11332   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11333   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11334     {
11335       if (dump_enabled_p ())
11336         dump_printf_loc (MSG_NOTE, vect_location,
11337                          "Profitability threshold is %d loop iterations.\n",
11338                          th);
11339       check_profitability = true;
11340     }
11341
11342   /* Make sure there exists a single-predecessor exit bb.  Do this before
11343      versioning.   */
11344   edge e = single_exit (loop);
11345   if (! single_pred_p (e->dest))
11346     {
11347       split_loop_exit_edge (e, true);
11348       if (dump_enabled_p ())
11349         dump_printf (MSG_NOTE, "split exit edge\n");
11350     }
11351
11352   /* Version the loop first, if required, so the profitability check
11353      comes first.  */
11354
11355   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11356     {
11357       class loop *sloop
11358         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11359       sloop->force_vectorize = false;
11360       check_profitability = false;
11361     }
11362
11363   /* Make sure there exists a single-predecessor exit bb also on the
11364      scalar loop copy.  Do this after versioning but before peeling
11365      so CFG structure is fine for both scalar and if-converted loop
11366      to make slpeel_duplicate_current_defs_from_edges face matched
11367      loop closed PHI nodes on the exit.  */
11368   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11369     {
11370       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11371       if (! single_pred_p (e->dest))
11372         {
11373           split_loop_exit_edge (e, true);
11374           if (dump_enabled_p ())
11375             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11376         }
11377     }
11378
11379   tree niters = vect_build_loop_niters (loop_vinfo);
11380   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11381   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11382   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11383   tree advance;
11384   drs_init_vec orig_drs_init;
11385
11386   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11387                               &step_vector, &niters_vector_mult_vf, th,
11388                               check_profitability, niters_no_overflow,
11389                               &advance);
11390   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11391       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11392     {
11393       /* Ifcvt duplicates loop preheader, loop body and produces an basic
11394          block after loop exit.  We need to scale all that.  */
11395       basic_block preheader
11396         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11397       preheader->count
11398         = preheader->count.apply_probability
11399               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11400       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11401                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11402       single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11403         = preheader->count;
11404     }
11405
11406   if (niters_vector == NULL_TREE)
11407     {
11408       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11409           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11410           && known_eq (lowest_vf, vf))
11411         {
11412           niters_vector
11413             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11414                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11415           step_vector = build_one_cst (TREE_TYPE (niters));
11416         }
11417       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11418         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11419                                      &step_vector, niters_no_overflow);
11420       else
11421         /* vect_do_peeling subtracted the number of peeled prologue
11422            iterations from LOOP_VINFO_NITERS.  */
11423         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11424                                      &niters_vector, &step_vector,
11425                                      niters_no_overflow);
11426     }
11427
11428   /* 1) Make sure the loop header has exactly two entries
11429      2) Make sure we have a preheader basic block.  */
11430
11431   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11432
11433   split_edge (loop_preheader_edge (loop));
11434
11435   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11436     /* This will deal with any possible peeling.  */
11437     vect_prepare_for_masked_peels (loop_vinfo);
11438
11439   /* Schedule the SLP instances first, then handle loop vectorization
11440      below.  */
11441   if (!loop_vinfo->slp_instances.is_empty ())
11442     {
11443       DUMP_VECT_SCOPE ("scheduling SLP instances");
11444       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11445     }
11446
11447   /* FORNOW: the vectorizer supports only loops which body consist
11448      of one basic block (header + empty latch). When the vectorizer will
11449      support more involved loop forms, the order by which the BBs are
11450      traversed need to be reconsidered.  */
11451
11452   for (i = 0; i < nbbs; i++)
11453     {
11454       basic_block bb = bbs[i];
11455       stmt_vec_info stmt_info;
11456
11457       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11458            gsi_next (&si))
11459         {
11460           gphi *phi = si.phi ();
11461           if (dump_enabled_p ())
11462             dump_printf_loc (MSG_NOTE, vect_location,
11463                              "------>vectorizing phi: %G", (gimple *) phi);
11464           stmt_info = loop_vinfo->lookup_stmt (phi);
11465           if (!stmt_info)
11466             continue;
11467
11468           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11469             vect_loop_kill_debug_uses (loop, stmt_info);
11470
11471           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11472               && !STMT_VINFO_LIVE_P (stmt_info))
11473             continue;
11474
11475           if (STMT_VINFO_VECTYPE (stmt_info)
11476               && (maybe_ne
11477                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11478               && dump_enabled_p ())
11479             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11480
11481           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11482                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11483                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11484                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11485                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11486                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11487               && ! PURE_SLP_STMT (stmt_info))
11488             {
11489               if (dump_enabled_p ())
11490                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11491               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11492             }
11493         }
11494
11495       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11496            gsi_next (&si))
11497         {
11498           gphi *phi = si.phi ();
11499           stmt_info = loop_vinfo->lookup_stmt (phi);
11500           if (!stmt_info)
11501             continue;
11502
11503           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11504               && !STMT_VINFO_LIVE_P (stmt_info))
11505             continue;
11506
11507           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11508                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11509                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11510                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11511                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11512                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11513               && ! PURE_SLP_STMT (stmt_info))
11514             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11515         }
11516
11517       for (gimple_stmt_iterator si = gsi_start_bb (bb);
11518            !gsi_end_p (si);)
11519         {
11520           stmt = gsi_stmt (si);
11521           /* During vectorization remove existing clobber stmts.  */
11522           if (gimple_clobber_p (stmt))
11523             {
11524               unlink_stmt_vdef (stmt);
11525               gsi_remove (&si, true);
11526               release_defs (stmt);
11527             }
11528           else
11529             {
11530               /* Ignore vector stmts created in the outer loop.  */
11531               stmt_info = loop_vinfo->lookup_stmt (stmt);
11532
11533               /* vector stmts created in the outer-loop during vectorization of
11534                  stmts in an inner-loop may not have a stmt_info, and do not
11535                  need to be vectorized.  */
11536               stmt_vec_info seen_store = NULL;
11537               if (stmt_info)
11538                 {
11539                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11540                     {
11541                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11542                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11543                            !gsi_end_p (subsi); gsi_next (&subsi))
11544                         {
11545                           stmt_vec_info pat_stmt_info
11546                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11547                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11548                                                     &si, &seen_store);
11549                         }
11550                       stmt_vec_info pat_stmt_info
11551                         = STMT_VINFO_RELATED_STMT (stmt_info);
11552                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11553                                                     &si, &seen_store))
11554                         maybe_set_vectorized_backedge_value (loop_vinfo,
11555                                                              pat_stmt_info);
11556                     }
11557                   else
11558                     {
11559                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11560                                                     &seen_store))
11561                         maybe_set_vectorized_backedge_value (loop_vinfo,
11562                                                              stmt_info);
11563                     }
11564                 }
11565               gsi_next (&si);
11566               if (seen_store)
11567                 {
11568                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11569                     /* Interleaving.  If IS_STORE is TRUE, the
11570                        vectorization of the interleaving chain was
11571                        completed - free all the stores in the chain.  */
11572                     vect_remove_stores (loop_vinfo,
11573                                         DR_GROUP_FIRST_ELEMENT (seen_store));
11574                   else
11575                     /* Free the attached stmt_vec_info and remove the stmt.  */
11576                     loop_vinfo->remove_stmt (stmt_info);
11577                 }
11578             }
11579         }
11580
11581       /* Stub out scalar statements that must not survive vectorization.
11582          Doing this here helps with grouped statements, or statements that
11583          are involved in patterns.  */
11584       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11585            !gsi_end_p (gsi); gsi_next (&gsi))
11586         {
11587           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11588           if (!call || !gimple_call_internal_p (call))
11589             continue;
11590           internal_fn ifn = gimple_call_internal_fn (call);
11591           if (ifn == IFN_MASK_LOAD)
11592             {
11593               tree lhs = gimple_get_lhs (call);
11594               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11595                 {
11596                   tree zero = build_zero_cst (TREE_TYPE (lhs));
11597                   gimple *new_stmt = gimple_build_assign (lhs, zero);
11598                   gsi_replace (&gsi, new_stmt, true);
11599                 }
11600             }
11601           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11602             {
11603               tree lhs = gimple_get_lhs (call);
11604               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11605                 {
11606                   tree else_arg
11607                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11608                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11609                   gsi_replace (&gsi, new_stmt, true);
11610                 }
11611             }
11612         }
11613     }                           /* BBs in loop */
11614
11615   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11616      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11617   if (integer_onep (step_vector))
11618     niters_no_overflow = true;
11619   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11620                            niters_vector_mult_vf, !niters_no_overflow);
11621
11622   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11623
11624   /* True if the final iteration might not handle a full vector's
11625      worth of scalar iterations.  */
11626   bool final_iter_may_be_partial
11627     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11628   /* The minimum number of iterations performed by the epilogue.  This
11629      is 1 when peeling for gaps because we always need a final scalar
11630      iteration.  */
11631   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11632   /* +1 to convert latch counts to loop iteration counts,
11633      -min_epilogue_iters to remove iterations that cannot be performed
11634        by the vector code.  */
11635   int bias_for_lowest = 1 - min_epilogue_iters;
11636   int bias_for_assumed = bias_for_lowest;
11637   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11638   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11639     {
11640       /* When the amount of peeling is known at compile time, the first
11641          iteration will have exactly alignment_npeels active elements.
11642          In the worst case it will have at least one.  */
11643       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11644       bias_for_lowest += lowest_vf - min_first_active;
11645       bias_for_assumed += assumed_vf - min_first_active;
11646     }
11647   /* In these calculations the "- 1" converts loop iteration counts
11648      back to latch counts.  */
11649   if (loop->any_upper_bound)
11650     {
11651       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11652       loop->nb_iterations_upper_bound
11653         = (final_iter_may_be_partial
11654            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11655                             lowest_vf) - 1
11656            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11657                              lowest_vf) - 1);
11658       if (main_vinfo
11659           /* Both peeling for alignment and peeling for gaps can end up
11660              with the scalar epilogue running for more than VF-1 iterations.  */
11661           && !main_vinfo->peeling_for_alignment
11662           && !main_vinfo->peeling_for_gaps)
11663         {
11664           unsigned int bound;
11665           poly_uint64 main_iters
11666             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11667                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11668           main_iters
11669             = upper_bound (main_iters,
11670                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11671           if (can_div_away_from_zero_p (main_iters,
11672                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11673                                         &bound))
11674             loop->nb_iterations_upper_bound
11675               = wi::umin ((bound_wide_int) (bound - 1),
11676                           loop->nb_iterations_upper_bound);
11677       }
11678   }
11679   if (loop->any_likely_upper_bound)
11680     loop->nb_iterations_likely_upper_bound
11681       = (final_iter_may_be_partial
11682          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11683                           + bias_for_lowest, lowest_vf) - 1
11684          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11685                            + bias_for_lowest, lowest_vf) - 1);
11686   if (loop->any_estimate)
11687     loop->nb_iterations_estimate
11688       = (final_iter_may_be_partial
11689          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11690                           assumed_vf) - 1
11691          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11692                            assumed_vf) - 1);
11693   scale_profile_for_vect_loop (loop, assumed_vf, flat);
11694
11695   if (dump_enabled_p ())
11696     {
11697       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11698         {
11699           dump_printf_loc (MSG_NOTE, vect_location,
11700                            "LOOP VECTORIZED\n");
11701           if (loop->inner)
11702             dump_printf_loc (MSG_NOTE, vect_location,
11703                              "OUTER LOOP VECTORIZED\n");
11704           dump_printf (MSG_NOTE, "\n");
11705         }
11706       else
11707         dump_printf_loc (MSG_NOTE, vect_location,
11708                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11709                          GET_MODE_NAME (loop_vinfo->vector_mode));
11710     }
11711
11712   /* Loops vectorized with a variable factor won't benefit from
11713      unrolling/peeling.  */
11714   if (!vf.is_constant ())
11715     {
11716       loop->unroll = 1;
11717       if (dump_enabled_p ())
11718         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11719                          " variable-length vectorization factor\n");
11720     }
11721   /* Free SLP instances here because otherwise stmt reference counting
11722      won't work.  */
11723   slp_instance instance;
11724   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11725     vect_free_slp_instance (instance);
11726   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11727   /* Clear-up safelen field since its value is invalid after vectorization
11728      since vectorized loop can have loop-carried dependencies.  */
11729   loop->safelen = 0;
11730
11731   if (epilogue)
11732     {
11733       update_epilogue_loop_vinfo (epilogue, advance);
11734
11735       epilogue->simduid = loop->simduid;
11736       epilogue->force_vectorize = loop->force_vectorize;
11737       epilogue->dont_vectorize = false;
11738     }
11739
11740   return epilogue;
11741 }
11742
11743 /* The code below is trying to perform simple optimization - revert
11744    if-conversion for masked stores, i.e. if the mask of a store is zero
11745    do not perform it and all stored value producers also if possible.
11746    For example,
11747      for (i=0; i<n; i++)
11748        if (c[i])
11749         {
11750           p1[i] += 1;
11751           p2[i] = p3[i] +2;
11752         }
11753    this transformation will produce the following semi-hammock:
11754
11755    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11756      {
11757        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11758        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11759        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11760        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11761        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11762        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11763      }
11764 */
11765
11766 void
11767 optimize_mask_stores (class loop *loop)
11768 {
11769   basic_block *bbs = get_loop_body (loop);
11770   unsigned nbbs = loop->num_nodes;
11771   unsigned i;
11772   basic_block bb;
11773   class loop *bb_loop;
11774   gimple_stmt_iterator gsi;
11775   gimple *stmt;
11776   auto_vec<gimple *> worklist;
11777   auto_purge_vect_location sentinel;
11778
11779   vect_location = find_loop_location (loop);
11780   /* Pick up all masked stores in loop if any.  */
11781   for (i = 0; i < nbbs; i++)
11782     {
11783       bb = bbs[i];
11784       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11785            gsi_next (&gsi))
11786         {
11787           stmt = gsi_stmt (gsi);
11788           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11789             worklist.safe_push (stmt);
11790         }
11791     }
11792
11793   free (bbs);
11794   if (worklist.is_empty ())
11795     return;
11796
11797   /* Loop has masked stores.  */
11798   while (!worklist.is_empty ())
11799     {
11800       gimple *last, *last_store;
11801       edge e, efalse;
11802       tree mask;
11803       basic_block store_bb, join_bb;
11804       gimple_stmt_iterator gsi_to;
11805       tree vdef, new_vdef;
11806       gphi *phi;
11807       tree vectype;
11808       tree zero;
11809
11810       last = worklist.pop ();
11811       mask = gimple_call_arg (last, 2);
11812       bb = gimple_bb (last);
11813       /* Create then_bb and if-then structure in CFG, then_bb belongs to
11814          the same loop as if_bb.  It could be different to LOOP when two
11815          level loop-nest is vectorized and mask_store belongs to the inner
11816          one.  */
11817       e = split_block (bb, last);
11818       bb_loop = bb->loop_father;
11819       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11820       join_bb = e->dest;
11821       store_bb = create_empty_bb (bb);
11822       add_bb_to_loop (store_bb, bb_loop);
11823       e->flags = EDGE_TRUE_VALUE;
11824       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11825       /* Put STORE_BB to likely part.  */
11826       efalse->probability = profile_probability::likely ();
11827       e->probability = efalse->probability.invert ();
11828       store_bb->count = efalse->count ();
11829       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11830       if (dom_info_available_p (CDI_DOMINATORS))
11831         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11832       if (dump_enabled_p ())
11833         dump_printf_loc (MSG_NOTE, vect_location,
11834                          "Create new block %d to sink mask stores.",
11835                          store_bb->index);
11836       /* Create vector comparison with boolean result.  */
11837       vectype = TREE_TYPE (mask);
11838       zero = build_zero_cst (vectype);
11839       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11840       gsi = gsi_last_bb (bb);
11841       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11842       /* Create new PHI node for vdef of the last masked store:
11843          .MEM_2 = VDEF <.MEM_1>
11844          will be converted to
11845          .MEM.3 = VDEF <.MEM_1>
11846          and new PHI node will be created in join bb
11847          .MEM_2 = PHI <.MEM_1, .MEM_3>
11848       */
11849       vdef = gimple_vdef (last);
11850       new_vdef = make_ssa_name (gimple_vop (cfun), last);
11851       gimple_set_vdef (last, new_vdef);
11852       phi = create_phi_node (vdef, join_bb);
11853       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11854
11855       /* Put all masked stores with the same mask to STORE_BB if possible.  */
11856       while (true)
11857         {
11858           gimple_stmt_iterator gsi_from;
11859           gimple *stmt1 = NULL;
11860
11861           /* Move masked store to STORE_BB.  */
11862           last_store = last;
11863           gsi = gsi_for_stmt (last);
11864           gsi_from = gsi;
11865           /* Shift GSI to the previous stmt for further traversal.  */
11866           gsi_prev (&gsi);
11867           gsi_to = gsi_start_bb (store_bb);
11868           gsi_move_before (&gsi_from, &gsi_to);
11869           /* Setup GSI_TO to the non-empty block start.  */
11870           gsi_to = gsi_start_bb (store_bb);
11871           if (dump_enabled_p ())
11872             dump_printf_loc (MSG_NOTE, vect_location,
11873                              "Move stmt to created bb\n%G", last);
11874           /* Move all stored value producers if possible.  */
11875           while (!gsi_end_p (gsi))
11876             {
11877               tree lhs;
11878               imm_use_iterator imm_iter;
11879               use_operand_p use_p;
11880               bool res;
11881
11882               /* Skip debug statements.  */
11883               if (is_gimple_debug (gsi_stmt (gsi)))
11884                 {
11885                   gsi_prev (&gsi);
11886                   continue;
11887                 }
11888               stmt1 = gsi_stmt (gsi);
11889               /* Do not consider statements writing to memory or having
11890                  volatile operand.  */
11891               if (gimple_vdef (stmt1)
11892                   || gimple_has_volatile_ops (stmt1))
11893                 break;
11894               gsi_from = gsi;
11895               gsi_prev (&gsi);
11896               lhs = gimple_get_lhs (stmt1);
11897               if (!lhs)
11898                 break;
11899
11900               /* LHS of vectorized stmt must be SSA_NAME.  */
11901               if (TREE_CODE (lhs) != SSA_NAME)
11902                 break;
11903
11904               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11905                 {
11906                   /* Remove dead scalar statement.  */
11907                   if (has_zero_uses (lhs))
11908                     {
11909                       gsi_remove (&gsi_from, true);
11910                       continue;
11911                     }
11912                 }
11913
11914               /* Check that LHS does not have uses outside of STORE_BB.  */
11915               res = true;
11916               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11917                 {
11918                   gimple *use_stmt;
11919                   use_stmt = USE_STMT (use_p);
11920                   if (is_gimple_debug (use_stmt))
11921                     continue;
11922                   if (gimple_bb (use_stmt) != store_bb)
11923                     {
11924                       res = false;
11925                       break;
11926                     }
11927                 }
11928               if (!res)
11929                 break;
11930
11931               if (gimple_vuse (stmt1)
11932                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
11933                 break;
11934
11935               /* Can move STMT1 to STORE_BB.  */
11936               if (dump_enabled_p ())
11937                 dump_printf_loc (MSG_NOTE, vect_location,
11938                                  "Move stmt to created bb\n%G", stmt1);
11939               gsi_move_before (&gsi_from, &gsi_to);
11940               /* Shift GSI_TO for further insertion.  */
11941               gsi_prev (&gsi_to);
11942             }
11943           /* Put other masked stores with the same mask to STORE_BB.  */
11944           if (worklist.is_empty ()
11945               || gimple_call_arg (worklist.last (), 2) != mask
11946               || worklist.last () != stmt1)
11947             break;
11948           last = worklist.pop ();
11949         }
11950       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11951     }
11952 }
11953
11954 /* Decide whether it is possible to use a zero-based induction variable
11955    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
11956    the value that the induction variable must be able to hold in order
11957    to ensure that the rgroups eventually have no active vector elements.
11958    Return -1 otherwise.  */
11959
11960 widest_int
11961 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11962 {
11963   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11964   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11965   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11966
11967   /* Calculate the value that the induction variable must be able
11968      to hit in order to ensure that we end the loop with an all-false mask.
11969      This involves adding the maximum number of inactive trailing scalar
11970      iterations.  */
11971   widest_int iv_limit = -1;
11972   if (max_loop_iterations (loop, &iv_limit))
11973     {
11974       if (niters_skip)
11975         {
11976           /* Add the maximum number of skipped iterations to the
11977              maximum iteration count.  */
11978           if (TREE_CODE (niters_skip) == INTEGER_CST)
11979             iv_limit += wi::to_widest (niters_skip);
11980           else
11981             iv_limit += max_vf - 1;
11982         }
11983       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11984         /* Make a conservatively-correct assumption.  */
11985         iv_limit += max_vf - 1;
11986
11987       /* IV_LIMIT is the maximum number of latch iterations, which is also
11988          the maximum in-range IV value.  Round this value down to the previous
11989          vector alignment boundary and then add an extra full iteration.  */
11990       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11991       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11992     }
11993   return iv_limit;
11994 }
11995
11996 /* For the given rgroup_controls RGC, check whether an induction variable
11997    would ever hit a value that produces a set of all-false masks or zero
11998    lengths before wrapping around.  Return true if it's possible to wrap
11999    around before hitting the desirable value, otherwise return false.  */
12000
12001 bool
12002 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12003 {
12004   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12005
12006   if (iv_limit == -1)
12007     return true;
12008
12009   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12010   unsigned int compare_precision = TYPE_PRECISION (compare_type);
12011   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12012
12013   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12014     return true;
12015
12016   return false;
12017 }