gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFs whose base is an array DECL
97 (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
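
   For illustration only (this example is not part of the original comment),
   the first loop below has the simple consecutive access pattern described
   above, while the second uses a strided access that does not satisfy it:

     for (i=0; i<N; i++)
       s += a[i];          consecutive ARRAY_REF access
     for (i=0; i<N; i++)
       s += a[2*i];        strided access, not a simple consecutive pattern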
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
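
   As a minimal sketch of that check (illustrative only; the real code goes
   through helpers in optabs-tree.cc such as optab_for_tree_code):

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     if (!op
         || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       return false;       no target support, the stmt cannot be vectorized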
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
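
   As a concrete illustration (not part of the original comment), for 4-byte
   ints on a target with 16-byte vectors the VF is 16/4 = 4, and the
   strip-mined loop corresponds to GNU C source such as:

     typedef int v4si __attribute__ ((vector_size (16)));
     for (i=0; i<N; i+=4)
       *(v4si *) &a[i] = *(v4si *) &b[i] + *(v4si *) &c[i];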
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Nonlinear induction is supported only for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
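/* As an illustrative example (not from the original sources), the kind of
   nonlinear induction recognized here covers loops such as:

     int x = init;
     for (int i = 0; i < n; i++)
       {
         a[i] = x;
         x = x << 1;       classified as vect_step_op_shl with step 1
       }

   and likewise x = -x (vect_step_op_neg, fake step -1) or x = x * C
   (vect_step_op_mul).  */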
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
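/* A typical source-level double reduction, shown for illustration only:

     int sum = 0;
     for (int i = 0; i < n; i++)       outer loop (outer1/outer2 above)
       for (int j = 0; j < m; j++)     inner loop
         sum += a[i][j];

   The outer-loop PHI for sum plays the role of x_1 in the diagram above,
   and the inner-loop PHI the role of x_2.  */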
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
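/* Source-level example of such a recurrence (illustration only):

     int prev = 0;
     for (int i = 0; i < n; i++)
       {
         b[i] = prev + 1;              uses last iteration's value
         prev = a[i];
       }

   The loop-header PHI for prev is a first-order recurrence: each iteration
   reads the value that the previous iteration stored.  */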
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be some subsequent
585 SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified; therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as a reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such an inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns, adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns, or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop executes and place the result
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
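/* Illustrative note (not part of the original comment): for a simple
   counted loop whose body executes n times (n >= 1), e.g.

     for (i = 0; i < n; i++) ...

   NUMBER_OF_ITERATIONSM1 is the latch count n - 1 and NUMBER_OF_ITERATIONS
   the header count n, matching the "+ 1" adjustment done below.  */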
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must determine which exit is the main one and
962 which are auxiliary exits.
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment. Analyze
968 all exits and return one */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
1052 /* CHECKME: We want to visit all BBs before their successors (except for
1053 latch blocks, for which this assertion wouldn't hold). In the simple
1054 case of the loop forms we allow, a dfs order of the BBs would be the same
1055 as reversed postorder traversal, so we are safe. */
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1061 for (unsigned int i = 0; i < nbbs; i++)
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 add_stmt (phi);
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 if (is_gimple_debug (stmt))
1078 continue;
1079 add_stmt (stmt);
1080 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081 third argument is the #pragma omp simd if (x) condition: when it is 0, the
1082 loop shouldn't be vectorized; when it is a non-zero constant, it should
1083 be vectorized normally; otherwise the loop is versioned, with vectorization
1084 done if the condition is non-zero at runtime. */
1085 if (loop_in->simduid
1086 && is_gimple_call (stmt)
1087 && gimple_call_internal_p (stmt)
1088 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1094 tree arg = gimple_call_arg (stmt, 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1103 epilogue_vinfos.create (6);
1106 /* Free all levels of rgroup CONTROLS. */
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1121 _loop_vec_info::~_loop_vec_info ()
1123 free (bbs);
1125 release_vec_loop_controls (&masks.rgc_vec);
1126 release_vec_loop_controls (&lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
1133 /* When we release an epilogue vinfo that we do not intend to use,
1134 avoid clearing AUX of the main loop, which should continue to
1135 point to the main loop vinfo since otherwise we'll leak that. */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1140 /* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153 if (! cached)
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1164 return cached;
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178 cmp_type, rgm->type,
1179 OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1184 /* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1198 /* Calculate the minimum precision necessary to represent:
1200 MAX_NITERS * FACTOR
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
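/* Worked example (illustrative): if the scalar loop header runs at most
   1000 times and FACTOR is 4, the product 4000 must be representable as an
   unsigned integer, so the function returns 12 (2^12 = 4096 >= 4000).  */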
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (max_ni, max_back_edges + 1);
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (max_ni * factor, UNSIGNED);
1224 /* True if the loop needs peeling or partial vectors when vectorized. */
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1266 return false;
1269 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
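/* Conceptually, each mask produced via IFN_WHILE_ULT behaves like the
   following scalar sketch (illustration only, not the generated IL):

     for (j = 0; j < nunits; j++)
       mask[j] = (iv + j < limit);

   where IV counts scalar iterations in the comparison type chosen below and
   LIMIT is derived from the number of iterations.  */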
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1276 unsigned int min_ni_width;
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324 UNSIGNED);
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1369 if (!cmp_type)
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
1381 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1388 /* Produce a differently organized rgc_vec and check in a different
1389 way whether we can produce the masks. */
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the number of elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1420 if (!iv_type)
1421 return false;
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1436 /* We index the rgroup_controls vector with nscalars_per_iter
1437 which we keep constant and instead have a varying nvectors,
1438 remembering the vector mask with the fewest nV. */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1443 if (!rgm->type || rgm->factor > nvectors)
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1465 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1467 ok = false;
1468 break;
1471 /* If iv_type is usable as compare type use that - we can elide the
1472 saturation in that case. */
1473 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1475 tree cmp_vectype
1476 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1477 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1478 rgc.compare_type = cmp_vectype;
1480 if (!rgc.compare_type)
1481 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1483 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1484 if (cmp_bits >= min_ni_width
1485 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1487 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1488 if (!cmp_type)
1489 continue;
1491 /* Check whether we can produce the mask with cmp_type. */
1492 tree cmp_vectype
1493 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1494 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1496 rgc.compare_type = cmp_vectype;
1497 break;
1501 if (!rgc.compare_type)
1503 ok = false;
1504 break;
1507 if (!ok)
1509 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1510 return false;
1513 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1514 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1515 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1516 return true;
1519 /* Check whether we can use vector accesses with length, based on a precision
1520 comparison. So far, to keep it simple, we only allow the case where the
1521 precision of the target-supported length is larger than the precision
1522 required by loop niters. */
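/* Conceptual sketch (illustrative only, not the generated IL): with
   length-based partial vectors the final, partial iteration is handled by
   passing a shorter length to the access, roughly

     len = MIN (remaining_scalar_iters, VF);
     x = .LEN_LOAD (ptr, align, len, bias);

   instead of peeling a scalar epilogue loop.  */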
1524 static bool
1525 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1527 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1528 return false;
1530 machine_mode len_load_mode, len_store_mode;
1531 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1532 .exists (&len_load_mode))
1533 return false;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1535 .exists (&len_store_mode))
1536 return false;
1538 signed char partial_load_bias = internal_len_load_store_bias
1539 (IFN_LEN_LOAD, len_load_mode);
1541 signed char partial_store_bias = internal_len_load_store_bias
1542 (IFN_LEN_STORE, len_store_mode);
1544 gcc_assert (partial_load_bias == partial_store_bias);
1546 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1547 return false;
1549 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1550 len_loads with a length of zero. In order to avoid that we prohibit
1551 more than one loop length here. */
1552 if (partial_load_bias == -1
1553 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1554 return false;
1556 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1558 unsigned int max_nitems_per_iter = 1;
1559 unsigned int i;
1560 rgroup_controls *rgl;
1561 /* Find the maximum number of items per iteration for every rgroup. */
1562 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1564 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1565 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1568 /* Work out how many bits we need to represent the length limit. */
1569 unsigned int min_ni_prec
1570 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1572 /* Now use the maximum of the precisions below for one suitable IV type:
1573 - the IV's natural precision
1574 - the precision needed to hold: the maximum number of scalar
1575 iterations multiplied by the scale factor (min_ni_prec above)
1576 - the Pmode precision
1578 If min_ni_prec is less than the precision of the current niters,
1579 we prefer to still use the niters type. Prefer to use a Pmode or
1580 wider IV to avoid narrow conversions. */
1582 unsigned int ni_prec
1583 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1584 min_ni_prec = MAX (min_ni_prec, ni_prec);
1585 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1587 tree iv_type = NULL_TREE;
1588 opt_scalar_int_mode tmode_iter;
1589 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1591 scalar_mode tmode = tmode_iter.require ();
1592 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1594 /* ??? Do we really want to construct one IV whose precision exceeds
1595 BITS_PER_WORD? */
1596 if (tbits > BITS_PER_WORD)
1597 break;
1599 /* Find the first available standard integral type. */
1600 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1602 iv_type = build_nonstandard_integer_type (tbits, true);
1603 break;
1607 if (!iv_type)
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611 "can't vectorize with length-based partial vectors"
1612 " because there is no suitable iv type.\n");
1613 return false;
1616 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1617 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1618 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1620 return true;
1623 /* Calculate the cost of one scalar iteration of the loop. */
1624 static void
1625 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1629 int nbbs = loop->num_nodes, factor;
1630 int innerloop_iters, i;
1632 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1634 /* Gather costs for statements in the scalar loop. */
1636 /* FORNOW. */
1637 innerloop_iters = 1;
1638 if (loop->inner)
1639 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1641 for (i = 0; i < nbbs; i++)
1643 gimple_stmt_iterator si;
1644 basic_block bb = bbs[i];
1646 if (bb->loop_father == loop->inner)
1647 factor = innerloop_iters;
1648 else
1649 factor = 1;
1651 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1653 gimple *stmt = gsi_stmt (si);
1654 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1656 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1657 continue;
1659 /* Skip stmts that are not vectorized inside the loop. */
1660 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1661 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1662 && (!STMT_VINFO_LIVE_P (vstmt_info)
1663 || !VECTORIZABLE_CYCLE_DEF
1664 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1665 continue;
1667 vect_cost_for_stmt kind;
1668 if (STMT_VINFO_DATA_REF (stmt_info))
1670 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1671 kind = scalar_load;
1672 else
1673 kind = scalar_store;
1675 else if (vect_nop_conversion_p (stmt_info))
1676 continue;
1677 else
1678 kind = scalar_stmt;
1680 /* We are using vect_prologue here to avoid scaling twice
1681 by the inner loop factor. */
1682 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1683 factor, kind, stmt_info, 0, vect_prologue);
1687 /* Now accumulate cost. */
1688 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1689 add_stmt_costs (loop_vinfo->scalar_costs,
1690 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1691 loop_vinfo->scalar_costs->finish_cost (nullptr);
1695 /* Function vect_analyze_loop_form.
1697 Verify that certain CFG restrictions hold, including:
1698 - the loop has a pre-header
1699 - the loop has a single entry and exit
1700 - the loop exit condition is simple enough
1701 - the number of iterations can be analyzed, i.e., a countable loop. The
1702 niter could be analyzed under some assumptions. */
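/* For reference (illustration, not from the original sources), the
   inner-most loop shape accepted below corresponds to source such as:

     i = 0;
     do
       {
         a[i] = b[i] + c[i];
         i++;
       }
     while (i < n);

   i.e. a two-block loop (header plus empty latch) whose exit condition is
   the last statement of the header.  */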
1704 opt_result
1705 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1707 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1709 edge exit_e = vec_init_loop_exit_info (loop);
1710 if (!exit_e)
1711 return opt_result::failure_at (vect_location,
1712 "not vectorized:"
1713 " could not determine main exit from"
1714 " loop with multiple exits.\n");
1715 info->loop_exit = exit_e;
1716 if (dump_enabled_p ())
1717 dump_printf_loc (MSG_NOTE, vect_location,
1718 "using as main loop exit: %d -> %d [AUX: %p]\n",
1719 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1721 /* Different restrictions apply when we are considering an inner-most loop,
1722 vs. an outer (nested) loop.
1723 (FORNOW. May want to relax some of these restrictions in the future). */
1725 info->inner_loop_cond = NULL;
1726 if (!loop->inner)
1728 /* Inner-most loop. We currently require that the number of BBs is
1729 exactly 2 (the header and latch). Vectorizable inner-most loops
1730 look like this:
1732 (pre-header)
1734 header <--------+
1735 | | |
1736 | +--> latch --+
1738 (exit-bb) */
1740 if (loop->num_nodes != 2)
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized:"
1743 " control flow in loop.\n");
1745 if (empty_block_p (loop->header))
1746 return opt_result::failure_at (vect_location,
1747 "not vectorized: empty loop.\n");
1749 else
1751 class loop *innerloop = loop->inner;
1752 edge entryedge;
1754 /* Nested loop. We currently require that the loop is doubly-nested,
1755 contains a single inner loop, and the number of BBs is exactly 5.
1756 Vectorizable outer-loops look like this:
1758 (pre-header)
1760 header <---+
1762 inner-loop |
1764 tail ------+
1766 (exit-bb)
1768 The inner-loop has the properties expected of inner-most loops
1769 as described above. */
1771 if ((loop->inner)->inner || (loop->inner)->next)
1772 return opt_result::failure_at (vect_location,
1773 "not vectorized:"
1774 " multiple nested loops.\n");
1776 if (loop->num_nodes != 5)
1777 return opt_result::failure_at (vect_location,
1778 "not vectorized:"
1779 " control flow in loop.\n");
1781 entryedge = loop_preheader_edge (innerloop);
1782 if (entryedge->src != loop->header
1783 || !single_exit (innerloop)
1784 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1785 return opt_result::failure_at (vect_location,
1786 "not vectorized:"
1787 " unsupported outerloop form.\n");
1789 /* Analyze the inner-loop. */
1790 vect_loop_form_info inner;
1791 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1792 if (!res)
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1796 "not vectorized: Bad inner loop.\n");
1797 return res;
1800 /* Don't support analyzing niter under assumptions for inner
1801 loop. */
1802 if (!integer_onep (inner.assumptions))
1803 return opt_result::failure_at (vect_location,
1804 "not vectorized: Bad inner loop.\n");
1806 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1807 return opt_result::failure_at (vect_location,
1808 "not vectorized: inner-loop count not"
1809 " invariant.\n");
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "Considering outer-loop vectorization.\n");
1814 info->inner_loop_cond = inner.conds[0];
1817 if (!single_exit (loop))
1818 return opt_result::failure_at (vect_location,
1819 "not vectorized: multiple exits.\n");
1820 if (EDGE_COUNT (loop->header->preds) != 2)
1821 return opt_result::failure_at (vect_location,
1822 "not vectorized:"
1823 " too many incoming edges.\n");
1825 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1826 that the loop is represented as a do-while (with a proper if-guard
1827 before the loop if needed), where the loop header contains all the
1828 executable statements, and the latch is empty. */
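/* As a hedged illustration only, that shape corresponds to source like

     if (n > 0)
       do
         {
           a[i] = b[i] + c[i];   // all work in the header block
           i++;
         }
       while (i < n);            // exit test last, latch block empty

   i.e. the exit test is the last thing executed in each iteration.  */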
1829 if (!empty_block_p (loop->latch)
1830 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1831 return opt_result::failure_at (vect_location,
1832 "not vectorized: latch block not empty.\n");
1834 /* Make sure the exit is not abnormal. */
1835 if (exit_e->flags & EDGE_ABNORMAL)
1836 return opt_result::failure_at (vect_location,
1837 "not vectorized:"
1838 " abnormal loop exit edge.\n");
1840 info->conds
1841 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1842 &info->number_of_iterations,
1843 &info->number_of_iterationsm1);
1845 if (info->conds.is_empty ())
1846 return opt_result::failure_at
1847 (vect_location,
1848 "not vectorized: complicated exit condition.\n");
1850 /* Determine what the primary and alternate exit conds are. */
1851 for (unsigned i = 0; i < info->conds.length (); i++)
1853 gcond *cond = info->conds[i];
1854 if (exit_e->src == gimple_bb (cond))
1855 std::swap (info->conds[0], info->conds[i]);
1858 if (integer_zerop (info->assumptions)
1859 || !info->number_of_iterations
1860 || chrec_contains_undetermined (info->number_of_iterations))
1861 return opt_result::failure_at
1862 (info->conds[0],
1863 "not vectorized: number of iterations cannot be computed.\n");
1865 if (integer_zerop (info->number_of_iterations))
1866 return opt_result::failure_at
1867 (info->conds[0],
1868 "not vectorized: number of iterations = 0.\n");
1870 if (!(tree_fits_shwi_p (info->number_of_iterations)
1871 && tree_to_shwi (info->number_of_iterations) > 0))
1873 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location,
1876 "Symbolic number of iterations is ");
1877 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1878 dump_printf (MSG_NOTE, "\n");
1882 return opt_result::success ();
1885 /* Create a loop_vec_info for LOOP with SHARED and the
1886 vect_analyze_loop_form result. */
1888 loop_vec_info
1889 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1890 const vect_loop_form_info *info,
1891 loop_vec_info main_loop_info)
1893 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1894 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1895 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1896 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1897 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1898 /* Also record the assumptions for versioning. */
1899 if (!integer_onep (info->assumptions) && !main_loop_info)
1900 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1902 for (gcond *cond : info->conds)
1904 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1905 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1908 for (unsigned i = 1; i < info->conds.length (); i ++)
1909 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1910 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1912 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1914 if (info->inner_loop_cond)
1916 stmt_vec_info inner_loop_cond_info
1917 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1918 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1919 /* If we have an estimate on the number of iterations of the inner
1920 loop, use that to limit the scale for costing, otherwise use
1921 --param vect-inner-loop-cost-factor literally. */
1922 widest_int nit;
1923 if (estimated_stmt_executions (loop->inner, &nit))
1924 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1925 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
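/* Hypothetical numbers, for illustration only: with an estimated 4
   executions of the inner loop body and --param
   vect-inner-loop-cost-factor set to 50, the scale used for costing is
   min (4, 50) = 4; without an estimate the parameter value is used as is.  */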
1928 return loop_vinfo;
1933 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1934 statements, update the vectorization factor. */
1936 static void
1937 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1939 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1940 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1941 int nbbs = loop->num_nodes;
1942 poly_uint64 vectorization_factor;
1943 int i;
1945 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1947 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948 gcc_assert (known_ne (vectorization_factor, 0U));
1950 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1951 vectorization factor of the loop is the unrolling factor required by
1952 the SLP instances. If that unrolling factor is 1, we say that we
1953 perform pure SLP on the loop - cross-iteration parallelism is not
1954 exploited. */
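/* Illustrative sketch (hedged, not derived from this function): in

     for (i = 0; i < n; i++)
       {
         a[2*i]   = b[2*i]   + 1;
         a[2*i+1] = b[2*i+1] + 2;
       }

   the two statements can form a single SLP group; if nothing else in the
   loop needs loop-based vectorization, the loop is "pure SLP" and the SLP
   unrolling factor alone determines the vectorization factor.  */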
1955 bool only_slp_in_loop = true;
1956 for (i = 0; i < nbbs; i++)
1958 basic_block bb = bbs[i];
1959 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1960 gsi_next (&si))
1962 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1963 if (!stmt_info)
1964 continue;
1965 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1966 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1967 && !PURE_SLP_STMT (stmt_info))
1968 /* STMT needs both SLP and loop-based vectorization. */
1969 only_slp_in_loop = false;
1971 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1972 gsi_next (&si))
1974 if (is_gimple_debug (gsi_stmt (si)))
1975 continue;
1976 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1977 stmt_info = vect_stmt_to_vectorize (stmt_info);
1978 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1979 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1980 && !PURE_SLP_STMT (stmt_info))
1981 /* STMT needs both SLP and loop-based vectorization. */
1982 only_slp_in_loop = false;
1986 if (only_slp_in_loop)
1988 if (dump_enabled_p ())
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "Loop contains only SLP stmts\n");
1991 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1993 else
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_NOTE, vect_location,
1997 "Loop contains SLP and non-SLP stmts\n");
1998 /* Both the vectorization factor and unroll factor have the form
1999 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2000 so they must have a common multiple. */
2001 vectorization_factor
2002 = force_common_multiple (vectorization_factor,
2003 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
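/* Hypothetical numbers: if the loop-based vectorization factor is 4 and
   the SLP unrolling factor is 6, the least common multiple 12 is used so
   that both the SLP and non-SLP parts of the loop can be covered.  */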
2006 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2007 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "Updating vectorization factor to ");
2011 dump_dec (MSG_NOTE, vectorization_factor);
2012 dump_printf (MSG_NOTE, ".\n");
2016 /* Return true if STMT_INFO describes a double reduction phi and if
2017 the other phi in the reduction is also relevant for vectorization.
2018 This rejects cases such as:
2020 outer1:
2021 x_1 = PHI <x_3(outer2), ...>;
2024 inner:
2025 x_2 = ...;
2028 outer2:
2029 x_3 = PHI <x_2(inner)>;
2031 if nothing in x_2 or elsewhere makes x_1 relevant. */
2033 static bool
2034 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2036 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2037 return false;
2039 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2042 /* Function vect_analyze_loop_operations.
2044 Scan the loop stmts and make sure they are all vectorizable. */
2046 static opt_result
2047 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2049 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2050 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2051 int nbbs = loop->num_nodes;
2052 int i;
2053 stmt_vec_info stmt_info;
2054 bool need_to_vectorize = false;
2055 bool ok;
2057 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2059 auto_vec<stmt_info_for_cost> cost_vec;
2061 for (i = 0; i < nbbs; i++)
2063 basic_block bb = bbs[i];
2065 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2066 gsi_next (&si))
2068 gphi *phi = si.phi ();
2069 ok = true;
2071 stmt_info = loop_vinfo->lookup_stmt (phi);
2072 if (dump_enabled_p ())
2073 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2074 (gimple *) phi);
2075 if (virtual_operand_p (gimple_phi_result (phi)))
2076 continue;
2078 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2079 (i.e., a phi in the tail of the outer-loop). */
2080 if (! is_loop_header_bb_p (bb))
2082 /* FORNOW: we currently don't support the case that these phis
2083 are not used in the outer loop (unless it is a double reduction,
2084 i.e., this phi is vect_reduction_def), because this case
2085 requires us to actually do something here. */
2086 if (STMT_VINFO_LIVE_P (stmt_info)
2087 && !vect_active_double_reduction_p (stmt_info))
2088 return opt_result::failure_at (phi,
2089 "Unsupported loop-closed phi"
2090 " in outer-loop.\n");
2092 /* If PHI is used in the outer loop, we check that its operand
2093 is defined in the inner loop. */
2094 if (STMT_VINFO_RELEVANT_P (stmt_info))
2096 tree phi_op;
2098 if (gimple_phi_num_args (phi) != 1)
2099 return opt_result::failure_at (phi, "unsupported phi");
2101 phi_op = PHI_ARG_DEF (phi, 0);
2102 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2103 if (!op_def_info)
2104 return opt_result::failure_at (phi, "unsupported phi\n");
2106 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2107 && (STMT_VINFO_RELEVANT (op_def_info)
2108 != vect_used_in_outer_by_reduction))
2109 return opt_result::failure_at (phi, "unsupported phi\n");
2111 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2112 || (STMT_VINFO_DEF_TYPE (stmt_info)
2113 == vect_double_reduction_def))
2114 && !vectorizable_lc_phi (loop_vinfo,
2115 stmt_info, NULL, NULL))
2116 return opt_result::failure_at (phi, "unsupported phi\n");
2119 continue;
2122 gcc_assert (stmt_info);
2124 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2125 || STMT_VINFO_LIVE_P (stmt_info))
2126 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2127 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2128 /* A scalar-dependence cycle that we don't support. */
2129 return opt_result::failure_at (phi,
2130 "not vectorized:"
2131 " scalar dependence cycle.\n");
2133 if (STMT_VINFO_RELEVANT_P (stmt_info))
2135 need_to_vectorize = true;
2136 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2137 && ! PURE_SLP_STMT (stmt_info))
2138 ok = vectorizable_induction (loop_vinfo,
2139 stmt_info, NULL, NULL,
2140 &cost_vec);
2141 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2142 || (STMT_VINFO_DEF_TYPE (stmt_info)
2143 == vect_double_reduction_def)
2144 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2145 && ! PURE_SLP_STMT (stmt_info))
2146 ok = vectorizable_reduction (loop_vinfo,
2147 stmt_info, NULL, NULL, &cost_vec);
2148 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2149 == vect_first_order_recurrence)
2150 && ! PURE_SLP_STMT (stmt_info))
2151 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2152 &cost_vec);
2155 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2156 if (ok
2157 && STMT_VINFO_LIVE_P (stmt_info)
2158 && !PURE_SLP_STMT (stmt_info))
2159 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2160 -1, false, &cost_vec);
2162 if (!ok)
2163 return opt_result::failure_at (phi,
2164 "not vectorized: relevant phi not "
2165 "supported: %G",
2166 static_cast <gimple *> (phi));
2169 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2170 gsi_next (&si))
2172 gimple *stmt = gsi_stmt (si);
2173 if (!gimple_clobber_p (stmt)
2174 && !is_gimple_debug (stmt))
2176 opt_result res
2177 = vect_analyze_stmt (loop_vinfo,
2178 loop_vinfo->lookup_stmt (stmt),
2179 &need_to_vectorize,
2180 NULL, NULL, &cost_vec);
2181 if (!res)
2182 return res;
2185 } /* bbs */
2187 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2189 /* All operations in the loop are either irrelevant (they deal with loop
2190 control, or are dead), or only used outside the loop and can be moved
2191 out of the loop (e.g. invariants, inductions). The loop can be
2192 optimized away by scalar optimizations. We're better off not
2193 touching this loop. */
2194 if (!need_to_vectorize)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_NOTE, vect_location,
2198 "All the computation can be taken out of the loop.\n");
2199 return opt_result::failure_at
2200 (vect_location,
2201 "not vectorized: redundant loop. no profit to vectorize.\n");
2204 return opt_result::success ();
2207 /* Return true if we know that the iteration count is smaller than the
2208 vectorization factor. Return false if it isn't, or if we can't be sure
2209 either way. */
2211 static bool
2212 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2214 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2216 HOST_WIDE_INT max_niter;
2217 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2218 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2219 else
2220 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2222 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2223 return true;
2225 return false;
2228 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2229 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2230 definitely no, or -1 if it's worth retrying. */
2232 static int
2233 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2234 unsigned *suggested_unroll_factor)
2236 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2237 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2239 /* Only loops that can handle partially-populated vectors can have iteration
2240 counts less than the vectorization factor. */
2241 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2242 && vect_known_niters_smaller_than_vf (loop_vinfo))
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "not vectorized: iteration count smaller than "
2247 "vectorization factor.\n");
2248 return 0;
2251 /* If we know the number of iterations we can do better: for the
2252 epilogue we can also decide whether the main loop leaves us
2253 with enough iterations, preferring a smaller vector epilogue that is
2254 then also possibly used for the case where we skip the vector loop. */
2255 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2257 widest_int scalar_niters
2258 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2259 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2261 loop_vec_info orig_loop_vinfo
2262 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2263 unsigned lowest_vf
2264 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2265 int prolog_peeling = 0;
2266 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2267 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2268 if (prolog_peeling >= 0
2269 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2270 lowest_vf))
2272 unsigned gap
2273 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2274 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2275 % lowest_vf + gap);
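/* Worked example with made-up numbers: with scalar_niters = 23,
   prolog_peeling = 3, gap = 0 and a main-loop VF of 8, the main loop
   covers 16 iterations and the epilogue is left with
   (23 - 0 - 3) % 8 + 0 = 4 scalar iterations.  */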
2278 /* Reject vectorizing for a single scalar iteration, even if
2279 we could in principle implement that using partial vectors. */
2280 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2281 if (scalar_niters <= peeling_gap + 1)
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "not vectorized: loop only has a single "
2286 "scalar iteration.\n");
2287 return 0;
2290 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2292 /* Check that the loop processes at least one full vector. */
2293 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2294 if (known_lt (scalar_niters, vf))
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "loop does not have enough iterations "
2299 "to support vectorization.\n");
2300 return 0;
2303 /* If we need to peel an extra epilogue iteration to handle data
2304 accesses with gaps, check that there are enough scalar iterations
2305 available.
2307 The check above is redundant with this one when peeling for gaps,
2308 but the distinction is useful for diagnostics. */
2309 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2310 && known_le (scalar_niters, vf))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "loop does not have enough iterations "
2315 "to support peeling for gaps.\n");
2316 return 0;
2321 /* If using the "very cheap" model, reject cases in which we'd keep
2322 a copy of the scalar code (even if we might be able to vectorize it). */
2323 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2324 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2325 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2326 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "some scalar iterations would need to be peeled\n");
2331 return 0;
2334 int min_profitable_iters, min_profitable_estimate;
2335 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2336 &min_profitable_estimate,
2337 suggested_unroll_factor);
2339 if (min_profitable_iters < 0)
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "not vectorized: vectorization not profitable.\n");
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vector version will never be "
2347 "profitable.\n");
2348 return -1;
2351 int min_scalar_loop_bound = (param_min_vect_loop_bound
2352 * assumed_vf);
2354 /* Use the cost model only if it is more conservative than user specified
2355 threshold. */
2356 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2357 min_profitable_iters);
2359 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
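/* Hypothetical numbers: with --param min-vect-loop-bound=2, an assumed VF
   of 4 and min_profitable_iters = 10, the recorded threshold is
   MAX (2 * 4, 10) = 10 iterations.  */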
2361 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2362 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2364 if (dump_enabled_p ())
2365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2366 "not vectorized: vectorization not profitable.\n");
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "not vectorized: iteration count smaller than user "
2370 "specified loop bound parameter or minimum profitable "
2371 "iterations (whichever is more conservative).\n");
2372 return 0;
2375 /* The static profitability threshold min_profitable_estimate includes
2376 the cost of having to check at runtime whether the scalar loop
2377 should be used instead. If it turns out that we don't need or want
2378 such a check, the threshold we should use for the static estimate
2379 is simply the point at which the vector loop becomes more profitable
2380 than the scalar loop. */
2381 if (min_profitable_estimate > min_profitable_iters
2382 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2383 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2384 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2385 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2387 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2389 " choice between the scalar and vector loops\n");
2390 min_profitable_estimate = min_profitable_iters;
2393 /* If the vector loop needs multiple iterations to be beneficial then
2394 things are probably too close to call, and the conservative thing
2395 would be to stick with the scalar code. */
2396 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2397 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401 "one iteration of the vector loop would be"
2402 " more expensive than the equivalent number of"
2403 " iterations of the scalar loop\n");
2404 return 0;
2407 HOST_WIDE_INT estimated_niter;
2409 /* If we are vectorizing an epilogue then we know the maximum number of
2410 scalar iterations it will cover is at least one lower than the
2411 vectorization factor of the main loop. */
2412 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2413 estimated_niter
2414 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2415 else
2417 estimated_niter = estimated_stmt_executions_int (loop);
2418 if (estimated_niter == -1)
2419 estimated_niter = likely_max_stmt_executions_int (loop);
2421 if (estimated_niter != -1
2422 && ((unsigned HOST_WIDE_INT) estimated_niter
2423 < MAX (th, (unsigned) min_profitable_estimate)))
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 "not vectorized: estimated iteration count too "
2428 "small.\n");
2429 if (dump_enabled_p ())
2430 dump_printf_loc (MSG_NOTE, vect_location,
2431 "not vectorized: estimated iteration count smaller "
2432 "than specified loop bound parameter or minimum "
2433 "profitable iterations (whichever is more "
2434 "conservative).\n");
2435 return -1;
2438 return 1;
2441 static opt_result
2442 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2443 vec<data_reference_p> *datarefs,
2444 unsigned int *n_stmts)
2446 *n_stmts = 0;
2447 for (unsigned i = 0; i < loop->num_nodes; i++)
2448 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2449 !gsi_end_p (gsi); gsi_next (&gsi))
2451 gimple *stmt = gsi_stmt (gsi);
2452 if (is_gimple_debug (stmt))
2453 continue;
2454 ++(*n_stmts);
2455 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2456 NULL, 0);
2457 if (!res)
2459 if (is_gimple_call (stmt) && loop->safelen)
2461 tree fndecl = gimple_call_fndecl (stmt), op;
2462 if (fndecl == NULL_TREE
2463 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2465 fndecl = gimple_call_arg (stmt, 0);
2466 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2467 fndecl = TREE_OPERAND (fndecl, 0);
2468 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2470 if (fndecl != NULL_TREE)
2472 cgraph_node *node = cgraph_node::get (fndecl);
2473 if (node != NULL && node->simd_clones != NULL)
2475 unsigned int j, n = gimple_call_num_args (stmt);
2476 for (j = 0; j < n; j++)
2478 op = gimple_call_arg (stmt, j);
2479 if (DECL_P (op)
2480 || (REFERENCE_CLASS_P (op)
2481 && get_base_address (op)))
2482 break;
2484 op = gimple_call_lhs (stmt);
2485 /* Ignore #pragma omp declare simd functions
2486 if they don't have data references in the
2487 call stmt itself. */
2488 if (j == n
2489 && !(op
2490 && (DECL_P (op)
2491 || (REFERENCE_CLASS_P (op)
2492 && get_base_address (op)))))
2493 continue;
2497 return res;
2499 /* If dependence analysis will give up due to the limit on the
2500 number of datarefs stop here and fail fatally. */
2501 if (datarefs->length ()
2502 > (unsigned)param_loop_max_datarefs_for_datadeps)
2503 return opt_result::failure_at (stmt, "exceeded param "
2504 "loop-max-datarefs-for-datadeps\n");
2506 return opt_result::success ();
2509 /* Look for SLP-only access groups and turn each individual access into its own
2510 group. */
2511 static void
2512 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2514 unsigned int i;
2515 struct data_reference *dr;
2517 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2519 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2520 FOR_EACH_VEC_ELT (datarefs, i, dr)
2522 gcc_assert (DR_REF (dr));
2523 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2525 /* Check if the load is a part of an interleaving chain. */
2526 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2528 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2529 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2530 unsigned int group_size = DR_GROUP_SIZE (first_element);
2532 /* Check if SLP-only groups. */
2533 if (!STMT_SLP_TYPE (stmt_info)
2534 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2536 /* Dissolve the group. */
2537 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2539 stmt_vec_info vinfo = first_element;
2540 while (vinfo)
2542 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2543 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2544 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2545 DR_GROUP_SIZE (vinfo) = 1;
2546 if (STMT_VINFO_STRIDED_P (first_element)
2547 /* We cannot handle stores with gaps. */
2548 || DR_IS_WRITE (dr_info->dr))
2550 STMT_VINFO_STRIDED_P (vinfo) = true;
2551 DR_GROUP_GAP (vinfo) = 0;
2553 else
2554 DR_GROUP_GAP (vinfo) = group_size - 1;
2555 /* Duplicate and adjust alignment info, it needs to
2556 be present on each group leader, see dr_misalignment. */
2557 if (vinfo != first_element)
2559 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2560 dr_info2->target_alignment = dr_info->target_alignment;
2561 int misalignment = dr_info->misalignment;
2562 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2564 HOST_WIDE_INT diff
2565 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2566 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2567 unsigned HOST_WIDE_INT align_c
2568 = dr_info->target_alignment.to_constant ();
2569 misalignment = (misalignment + diff) % align_c;
2571 dr_info2->misalignment = misalignment;
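/* Worked example (made-up numbers): if the group leader has
   misalignment 8 against a target alignment of 16 and this element's
   DR_INIT is 4 bytes further on, its misalignment becomes
   (8 + 4) % 16 = 12.  */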
2573 vinfo = next;
2580 /* Determine if operating on full vectors for LOOP_VINFO might leave
2581 some scalar iterations still to do. If so, decide how we should
2582 handle those scalar iterations. The possibilities are:
2584 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2585 In this case:
2587 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2588 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2589 LOOP_VINFO_PEELING_FOR_NITER == false
2591 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2592 to handle the remaining scalar iterations. In this case:
2594 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2595 LOOP_VINFO_PEELING_FOR_NITER == true
2597 There are two choices:
2599 (2a) Consider vectorizing the epilogue loop at the same VF as the
2600 main loop, but using partial vectors instead of full vectors.
2601 In this case:
2603 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2605 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2606 In this case:
2608 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2611 opt_result
2612 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2614 /* Determine whether there would be any scalar iterations left over. */
2615 bool need_peeling_or_partial_vectors_p
2616 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2618 /* Decide whether to vectorize the loop with partial vectors. */
2619 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2621 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2622 && need_peeling_or_partial_vectors_p)
2624 /* For partial-vector-usage=1, try to push the handling of partial
2625 vectors to the epilogue, with the main loop continuing to operate
2626 on full vectors.
2628 If we are unrolling we also do not want to use partial vectors. This
2629 is to avoid the overhead of generating multiple masks and also to
2630 avoid having to execute entire iterations of FALSE masked instructions
2631 when dealing with one or fewer full iterations.
2633 ??? We could then end up failing to use partial vectors if we
2634 decide to peel iterations into a prologue, and if the main loop
2635 then ends up processing fewer than VF iterations. */
2636 if ((param_vect_partial_vector_usage == 1
2637 || loop_vinfo->suggested_unroll_factor > 1)
2638 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2639 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2640 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2641 else
2642 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2645 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_NOTE, vect_location,
2647 "operating on %s vectors%s.\n",
2648 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2649 ? "partial" : "full",
2650 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2651 ? " for epilogue loop" : "");
2653 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2654 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2655 && need_peeling_or_partial_vectors_p);
2657 return opt_result::success ();
2660 /* Function vect_analyze_loop_2.
2662 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2663 analyses record information in some members of LOOP_VINFO. FATAL
2664 indicates whether some analysis hit a fatal error. If the non-NULL
2665 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be
2666 filled with a worked-out suggested unroll factor, while a NULL pointer
2667 indicates that the suggested unroll factor is being applied.
2668 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the
2669 suggested unroll factor was worked out. */
2670 static opt_result
2671 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2672 unsigned *suggested_unroll_factor,
2673 bool& slp_done_for_suggested_uf)
2675 opt_result ok = opt_result::success ();
2676 int res;
2677 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2678 poly_uint64 min_vf = 2;
2679 loop_vec_info orig_loop_vinfo = NULL;
2681 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2682 loop_vec_info of the first vectorized loop. */
2683 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2684 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2685 else
2686 orig_loop_vinfo = loop_vinfo;
2687 gcc_assert (orig_loop_vinfo);
2689 /* The first group of checks is independent of the vector size. */
2690 fatal = true;
2692 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2693 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2694 return opt_result::failure_at (vect_location,
2695 "not vectorized: simd if(0)\n");
2697 /* Find all data references in the loop (which correspond to vdefs/vuses)
2698 and analyze their evolution in the loop. */
2700 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2702 /* Gather the data references and count stmts in the loop. */
2703 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2705 opt_result res
2706 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2707 &LOOP_VINFO_DATAREFS (loop_vinfo),
2708 &LOOP_VINFO_N_STMTS (loop_vinfo));
2709 if (!res)
2711 if (dump_enabled_p ())
2712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2713 "not vectorized: loop contains function "
2714 "calls or data references that cannot "
2715 "be analyzed\n");
2716 return res;
2718 loop_vinfo->shared->save_datarefs ();
2720 else
2721 loop_vinfo->shared->check_datarefs ();
2723 /* Analyze the data references and also adjust the minimal
2724 vectorization factor according to the loads and stores. */
2726 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2727 if (!ok)
2729 if (dump_enabled_p ())
2730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2731 "bad data references.\n");
2732 return ok;
2735 /* Check if we are applying unroll factor now. */
2736 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2737 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2739 /* If the SLP decision was false when the suggested unroll factor was
2740 worked out, and we are now applying that suggested unroll factor, we
2741 can simply skip all SLP-related analyses this time. */
2742 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2744 /* Classify all cross-iteration scalar data-flow cycles.
2745 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2746 vect_analyze_scalar_cycles (loop_vinfo, slp);
2748 vect_pattern_recog (loop_vinfo);
2750 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2752 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2753 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2755 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2756 if (!ok)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad data access.\n");
2761 return ok;
2764 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2766 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2767 if (!ok)
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2771 "unexpected pattern.\n");
2772 return ok;
2775 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is not considered fatal. */
2776 fatal = false;
2778 /* Analyze data dependences between the data-refs in the loop
2779 and adjust the maximum vectorization factor according to
2780 the dependences.
2781 FORNOW: fail at the first data dependence that we encounter. */
2783 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2784 if (!ok)
2786 if (dump_enabled_p ())
2787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2788 "bad data dependence.\n");
2789 return ok;
2791 if (max_vf != MAX_VECTORIZATION_FACTOR
2792 && maybe_lt (max_vf, min_vf))
2793 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2794 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2796 ok = vect_determine_vectorization_factor (loop_vinfo);
2797 if (!ok)
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801 "can't determine vectorization factor.\n");
2802 return ok;
2804 if (max_vf != MAX_VECTORIZATION_FACTOR
2805 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2806 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2808 /* Compute the scalar iteration cost. */
2809 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2811 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2813 if (slp)
2815 /* Check the SLP opportunities in the loop, analyze and build
2816 SLP trees. */
2817 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2818 if (!ok)
2819 return ok;
2821 /* If there are any SLP instances mark them as pure_slp. */
2822 slp = vect_make_slp_decision (loop_vinfo);
2823 if (slp)
2825 /* Find stmts that need to be both vectorized and SLPed. */
2826 vect_detect_hybrid_slp (loop_vinfo);
2828 /* Update the vectorization factor based on the SLP decision. */
2829 vect_update_vf_for_slp (loop_vinfo);
2831 /* Optimize the SLP graph with the vectorization factor fixed. */
2832 vect_optimize_slp (loop_vinfo);
2834 /* Gather the loads reachable from the SLP graph entries. */
2835 vect_gather_slp_loads (loop_vinfo);
2839 bool saved_can_use_partial_vectors_p
2840 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2842 /* We don't expect to have to roll back to anything other than an empty
2843 set of rgroups. */
2844 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2846 /* This is the point where we can re-start analysis with SLP forced off. */
2847 start_over:
2849 /* Apply the suggested unrolling factor; this was determined by the backend
2850 during finish_cost the first time we ran the analysis for this
2851 vector mode. */
2852 if (applying_suggested_uf)
2853 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2855 /* Now the vectorization factor is final. */
2856 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2857 gcc_assert (known_ne (vectorization_factor, 0U));
2859 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2861 dump_printf_loc (MSG_NOTE, vect_location,
2862 "vectorization_factor = ");
2863 dump_dec (MSG_NOTE, vectorization_factor);
2864 dump_printf (MSG_NOTE, ", niters = %wd\n",
2865 LOOP_VINFO_INT_NITERS (loop_vinfo));
2868 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2870 /* Analyze the alignment of the data-refs in the loop.
2871 Fail if a data reference is found that cannot be vectorized. */
2873 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2874 if (!ok)
2876 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878 "bad data alignment.\n");
2879 return ok;
2882 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2883 It is important to call pruning after vect_analyze_data_ref_accesses,
2884 since we use grouping information gathered by interleaving analysis. */
2885 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2886 if (!ok)
2887 return ok;
2889 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2890 vectorization, since we do not want to add extra peeling or
2891 add versioning for alignment. */
2892 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2893 /* This pass will decide on using loop versioning and/or loop peeling in
2894 order to enhance the alignment of data references in the loop. */
2895 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2896 if (!ok)
2897 return ok;
2899 if (slp)
2901 /* Analyze operations in the SLP instances. Note this may
2902 remove unsupported SLP instances which makes the above
2903 SLP kind detection invalid. */
2904 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2905 vect_slp_analyze_operations (loop_vinfo);
2906 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2908 ok = opt_result::failure_at (vect_location,
2909 "unsupported SLP instances\n");
2910 goto again;
2913 /* Check whether any load in ALL SLP instances is possibly permuted. */
2914 slp_tree load_node, slp_root;
2915 unsigned i, x;
2916 slp_instance instance;
2917 bool can_use_lanes = true;
2918 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2920 slp_root = SLP_INSTANCE_TREE (instance);
2921 int group_size = SLP_TREE_LANES (slp_root);
2922 tree vectype = SLP_TREE_VECTYPE (slp_root);
2923 bool loads_permuted = false;
2924 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2926 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2927 continue;
2928 unsigned j;
2929 stmt_vec_info load_info;
2930 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2931 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2933 loads_permuted = true;
2934 break;
2938 /* If the loads and stores can be handled with load/store-lane
2939 instructions record it and move on to the next instance. */
2940 if (loads_permuted
2941 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2942 && vect_store_lanes_supported (vectype, group_size, false)
2943 != IFN_LAST)
2945 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2946 if (STMT_VINFO_GROUPED_ACCESS
2947 (SLP_TREE_REPRESENTATIVE (load_node)))
2949 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2950 (SLP_TREE_REPRESENTATIVE (load_node));
2951 /* Use SLP for strided accesses (or if we can't
2952 load-lanes). */
2953 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2954 || vect_load_lanes_supported
2955 (STMT_VINFO_VECTYPE (stmt_vinfo),
2956 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2957 break;
2960 can_use_lanes
2961 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2963 if (can_use_lanes && dump_enabled_p ())
2964 dump_printf_loc (MSG_NOTE, vect_location,
2965 "SLP instance %p can use load/store-lanes\n",
2966 (void *) instance);
2968 else
2970 can_use_lanes = false;
2971 break;
2975 /* If all SLP instances can use load/store-lanes abort SLP and try again
2976 with SLP disabled. */
2977 if (can_use_lanes)
2979 ok = opt_result::failure_at (vect_location,
2980 "Built SLP cancelled: can use "
2981 "load/store-lanes\n");
2982 if (dump_enabled_p ())
2983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2984 "Built SLP cancelled: all SLP instances support "
2985 "load/store-lanes\n");
2986 goto again;
2990 /* Dissolve SLP-only groups. */
2991 vect_dissolve_slp_only_groups (loop_vinfo);
2993 /* Scan all the remaining operations in the loop that are not subject
2994 to SLP and make sure they are vectorizable. */
2995 ok = vect_analyze_loop_operations (loop_vinfo);
2996 if (!ok)
2998 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3000 "bad operation or unsupported loop bound.\n");
3001 return ok;
3004 /* For now, we don't expect to mix both masking and length approaches for one
3005 loop; disable the use of partial vectors if both are recorded. */
3006 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3007 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3008 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3010 if (dump_enabled_p ())
3011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3012 "can't vectorize a loop with partial vectors"
3013 " because we don't expect to mix different"
3014 " approaches with partial vectors for the"
3015 " same loop.\n");
3016 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3019 /* If we still have the option of using partial vectors,
3020 check whether we can generate the necessary loop controls. */
3021 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3023 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3025 if (!vect_verify_full_masking (loop_vinfo)
3026 && !vect_verify_full_masking_avx512 (loop_vinfo))
3027 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3029 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3030 if (!vect_verify_loop_lens (loop_vinfo))
3031 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3034 /* If we're vectorizing a loop that uses length "controls" and
3035 can iterate more than once, we apply the decrementing IV approach
3036 to the loop control. */
3037 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3038 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3039 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3040 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3041 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3042 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3043 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3045 /* If a loop uses length controls and has a decrementing loop control IV,
3046 we will normally pass that IV through a MIN_EXPR to calculate the
3047 basis for the length controls. E.g. in a loop that processes one
3048 element per scalar iteration, the number of elements would be
3049 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3051 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3052 step, since only the final iteration of the vector loop can have
3053 inactive lanes.
3055 However, some targets have a dedicated instruction for calculating the
3056 preferred length, given the total number of elements that still need to
3057 be processed. This is encapsulated in the SELECT_VL internal function.
3059 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3060 to determine the basis for the length controls. However, unlike the
3061 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3062 lanes inactive in any iteration of the vector loop, not just the last
3063 iteration. This SELECT_VL approach therefore requires us to use pointer
3064 IVs with variable steps.
3066 Once we've decided how many elements should be processed by one
3067 iteration of the vector loop, we need to populate the rgroup controls.
3068 If a loop has multiple rgroups, we need to make sure that those rgroups
3069 "line up" (that is, they must be consistent about which elements are
3070 active and which aren't). This is done by vect_adjust_loop_lens_control.
3072 In principle, it would be possible to use vect_adjust_loop_lens_control
3073 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3074 However:
3076 (1) In practice, it only makes sense to use SELECT_VL when a vector
3077 operation will be controlled directly by the result. It is not
3078 worth using SELECT_VL if it would only be the input to other
3079 calculations.
3081 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3082 pointer IV will need N updates by a variable amount (N-1 updates
3083 within the iteration and 1 update to move to the next iteration).
3085 Because of this, we prefer to use the MIN_EXPR approach whenever there
3086 is more than one length control.
3088 In addition, SELECT_VL always operates to a granularity of 1 unit.
3089 If we wanted to use it to control an SLP operation on N consecutive
3090 elements, we would need to make the SELECT_VL inputs measure scalar
3091 iterations (rather than elements) and then multiply the SELECT_VL
3092 result by N. But using SELECT_VL this way is inefficient because
3093 of (1) above.
3095 In addition, we don't apply SELECT_VL on a single rgroup when both (1)
3096 and (2) are satisfied:
3098 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3099 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3101 Since SELECT_VL (with its variable step) would make SCEV analysis fail,
3102 we would then miss the benefits of subsequent unroll optimizations.
3103 We prefer using the MIN_EXPR approach in this situation. */
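/* A hedged sketch of the two styles described above (simplified names,
   not actual generated GIMPLE):

     // MIN_EXPR style: invariant pointer step; only the final
     // iteration can have inactive lanes.
     len = MIN_EXPR <remaining, VF>;
     ... length-controlled ops on LEN elements ...
     ptr += VF;
     remaining -= VF;

     // SELECT_VL style: the target chooses the preferred length, so
     // pointer IVs must step by the variable LEN.
     len = SELECT_VL (remaining, VF);
     ... length-controlled ops on LEN elements ...
     ptr += len;
     remaining -= len;
*/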
3104 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3106 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3107 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3108 OPTIMIZE_FOR_SPEED)
3109 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3110 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3111 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3112 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3113 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3116 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3117 assuming that the loop will be used as a main loop. We will redo
3118 this analysis later if we instead decide to use the loop as an
3119 epilogue loop. */
3120 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3121 if (!ok)
3122 return ok;
3124 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3125 to be able to handle fewer than VF scalars, or needs to have a lower VF
3126 than the main loop. */
3127 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3128 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3130 poly_uint64 unscaled_vf
3131 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3132 orig_loop_vinfo->suggested_unroll_factor);
3133 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3134 return opt_result::failure_at (vect_location,
3135 "Vectorization factor too high for"
3136 " epilogue loop.\n");
3139 /* Check the costings of the loop make vectorizing worthwhile. */
3140 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3141 if (res < 0)
3143 ok = opt_result::failure_at (vect_location,
3144 "Loop costings may not be worthwhile.\n");
3145 goto again;
3147 if (!res)
3148 return opt_result::failure_at (vect_location,
3149 "Loop costings not worthwhile.\n");
3151 /* If an epilogue loop is required make sure we can create one. */
3152 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3153 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3155 if (dump_enabled_p ())
3156 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3157 if (!vect_can_advance_ivs_p (loop_vinfo)
3158 || !slpeel_can_duplicate_loop_p (loop,
3159 LOOP_VINFO_IV_EXIT (loop_vinfo),
3160 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3162 ok = opt_result::failure_at (vect_location,
3163 "not vectorized: can't create required "
3164 "epilog loop\n");
3165 goto again;
3169 /* During peeling, we need to check if number of loop iterations is
3170 enough for both peeled prolog loop and vector loop. This check
3171 can be merged along with threshold check of loop versioning, so
3172 increase threshold for this case if necessary.
3174 If we are analyzing an epilogue we still want to check what its
3175 versioning threshold would be. If we decide to vectorize the epilogues we
3176 will want to use the lowest versioning threshold of all epilogues and main
3177 loop. This will enable us to enter a vectorized epilogue even when
3178 versioning the loop. We can't simply check whether the epilogue requires
3179 versioning though since we may have skipped some versioning checks when
3180 analyzing the epilogue. For instance, checks for alias versioning will be
3181 skipped when dealing with epilogues as we assume we already checked them
3182 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3183 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3185 poly_uint64 niters_th = 0;
3186 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3188 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3190 /* Niters for peeled prolog loop. */
3191 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3193 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3194 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3195 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3197 else
3198 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3201 /* Niters for at least one iteration of vectorized loop. */
3202 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3203 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3204 /* One additional iteration because of peeling for gap. */
3205 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3206 niters_th += 1;
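/* Hypothetical example: peeling 3 prologue iterations for alignment, a
   vectorization factor of 8 and peeling for gaps gives
   niters_th = 3 + 8 + 1 = 12 before being merged with the cost-model
   threshold below.  */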
3208 /* Use the same condition as vect_transform_loop to decide when to use
3209 the cost to determine a versioning threshold. */
3210 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3211 && ordered_p (th, niters_th))
3212 niters_th = ordered_max (poly_uint64 (th), niters_th);
3214 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3217 gcc_assert (known_eq (vectorization_factor,
3218 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3220 slp_done_for_suggested_uf = slp;
3222 /* Ok to vectorize! */
3223 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3224 return opt_result::success ();
3226 again:
3227 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3228 gcc_assert (!ok);
3230 /* Try again with SLP forced off but if we didn't do any SLP there is
3231 no point in re-trying. */
3232 if (!slp)
3233 return ok;
3235 /* If the SLP decision was true when the suggested unroll factor was
3236 worked out, and we are now applying that suggested unroll factor, we
3237 don't need to re-try any more. */
3238 if (applying_suggested_uf && slp_done_for_suggested_uf)
3239 return ok;
3241 /* If there are reduction chains re-trying will fail anyway. */
3242 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3243 return ok;
3245 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3246 via interleaving or lane instructions. */
3247 slp_instance instance;
3248 slp_tree node;
3249 unsigned i, j;
3250 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3252 stmt_vec_info vinfo;
3253 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3254 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3255 continue;
3256 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3257 unsigned int size = DR_GROUP_SIZE (vinfo);
3258 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3259 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3260 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3261 && ! vect_grouped_store_supported (vectype, size))
3262 return opt_result::failure_at (vinfo->stmt,
3263 "unsupported grouped store\n");
3264 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3266 vinfo = SLP_TREE_REPRESENTATIVE (node);
3267 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3269 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3270 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3271 size = DR_GROUP_SIZE (vinfo);
3272 vectype = STMT_VINFO_VECTYPE (vinfo);
3273 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3274 && ! vect_grouped_load_supported (vectype, single_element_p,
3275 size))
3276 return opt_result::failure_at (vinfo->stmt,
3277 "unsupported grouped load\n");
3282 if (dump_enabled_p ())
3283 dump_printf_loc (MSG_NOTE, vect_location,
3284 "re-trying with SLP disabled\n");
3286 /* Roll back state appropriately. No SLP this time. */
3287 slp = false;
3288 /* Restore vectorization factor as it were without SLP. */
3289 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3290 /* Free the SLP instances. */
3291 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3292 vect_free_slp_instance (instance);
3293 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3294 /* Reset SLP type to loop_vect on all stmts. */
3295 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3297 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3298 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3299 !gsi_end_p (si); gsi_next (&si))
3301 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3302 STMT_SLP_TYPE (stmt_info) = loop_vect;
3303 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3304 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3306 /* vectorizable_reduction adjusts reduction stmt def-types,
3307 restore them to that of the PHI. */
3308 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3309 = STMT_VINFO_DEF_TYPE (stmt_info);
3310 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3311 (STMT_VINFO_REDUC_DEF (stmt_info)))
3312 = STMT_VINFO_DEF_TYPE (stmt_info);
3315 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3316 !gsi_end_p (si); gsi_next (&si))
3318 if (is_gimple_debug (gsi_stmt (si)))
3319 continue;
3320 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3321 STMT_SLP_TYPE (stmt_info) = loop_vect;
3322 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3324 stmt_vec_info pattern_stmt_info
3325 = STMT_VINFO_RELATED_STMT (stmt_info);
3326 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3327 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3329 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3330 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3331 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3332 !gsi_end_p (pi); gsi_next (&pi))
3333 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3334 = loop_vect;
3338 /* Free optimized alias test DDRS. */
3339 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3340 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3341 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3342 /* Reset target cost data. */
3343 delete loop_vinfo->vector_costs;
3344 loop_vinfo->vector_costs = nullptr;
3345 /* Reset accumulated rgroup information. */
3346 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3347 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3348 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3349 /* Reset assorted flags. */
3350 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3351 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3352 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3353 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3354 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3355 = saved_can_use_partial_vectors_p;
3357 goto start_over;
3360 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3361 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3362 OLD_LOOP_VINFO is better unless something specifically indicates
3363 otherwise.
3365 Note that this deliberately isn't a partial order. */
3367 static bool
3368 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3369 loop_vec_info old_loop_vinfo)
3371 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3372 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3374 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3375 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3377 /* Always prefer a VF of loop->simdlen over any other VF. */
3378 if (loop->simdlen)
3380 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3381 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3382 if (new_simdlen_p != old_simdlen_p)
3383 return new_simdlen_p;
3386 const auto *old_costs = old_loop_vinfo->vector_costs;
3387 const auto *new_costs = new_loop_vinfo->vector_costs;
3388 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3389 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3391 return new_costs->better_main_loop_than_p (old_costs);
3394 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3395 true if we should. */
3397 static bool
3398 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3399 loop_vec_info old_loop_vinfo)
3401 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3402 return false;
3404 if (dump_enabled_p ())
3405 dump_printf_loc (MSG_NOTE, vect_location,
3406 "***** Preferring vector mode %s to vector mode %s\n",
3407 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3408 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3409 return true;
3412 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3413 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3414 MODE_I to the next mode useful to analyze.
3415 Return the loop_vinfo on success and wrapped null on failure. */
3417 static opt_loop_vec_info
3418 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3419 const vect_loop_form_info *loop_form_info,
3420 loop_vec_info main_loop_vinfo,
3421 const vector_modes &vector_modes, unsigned &mode_i,
3422 machine_mode &autodetected_vector_mode,
3423 bool &fatal)
3425 loop_vec_info loop_vinfo
3426 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3428 machine_mode vector_mode = vector_modes[mode_i];
3429 loop_vinfo->vector_mode = vector_mode;
3430 unsigned int suggested_unroll_factor = 1;
3431 bool slp_done_for_suggested_uf = false;
3433 /* Run the main analysis. */
3434 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3435 &suggested_unroll_factor,
3436 slp_done_for_suggested_uf);
3437 if (dump_enabled_p ())
3438 dump_printf_loc (MSG_NOTE, vect_location,
3439 "***** Analysis %s with vector mode %s\n",
3440 res ? "succeeded" : "failed",
3441 GET_MODE_NAME (loop_vinfo->vector_mode));
3443 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3445 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_NOTE, vect_location,
3447 "***** Re-trying analysis for unrolling"
3448 " with unroll factor %d and slp %s.\n",
3449 suggested_unroll_factor,
3450 slp_done_for_suggested_uf ? "on" : "off");
3451 loop_vec_info unroll_vinfo
3452 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3453 unroll_vinfo->vector_mode = vector_mode;
3454 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3455 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3456 slp_done_for_suggested_uf);
3457 if (new_res)
3459 delete loop_vinfo;
3460 loop_vinfo = unroll_vinfo;
3462 else
3463 delete unroll_vinfo;
3466 /* Remember the autodetected vector mode. */
3467 if (vector_mode == VOIDmode)
3468 autodetected_vector_mode = loop_vinfo->vector_mode;
3470 /* Advance mode_i, first skipping modes that would result in the
3471 same analysis result. */
3472 while (mode_i + 1 < vector_modes.length ()
3473 && vect_chooses_same_modes_p (loop_vinfo,
3474 vector_modes[mode_i + 1]))
3476 if (dump_enabled_p ())
3477 dump_printf_loc (MSG_NOTE, vect_location,
3478 "***** The result for vector mode %s would"
3479 " be the same\n",
3480 GET_MODE_NAME (vector_modes[mode_i + 1]));
3481 mode_i += 1;
3483 if (mode_i + 1 < vector_modes.length ()
3484 && VECTOR_MODE_P (autodetected_vector_mode)
3485 && (related_vector_mode (vector_modes[mode_i + 1],
3486 GET_MODE_INNER (autodetected_vector_mode))
3487 == autodetected_vector_mode)
3488 && (related_vector_mode (autodetected_vector_mode,
3489 GET_MODE_INNER (vector_modes[mode_i + 1]))
3490 == vector_modes[mode_i + 1]))
3492 if (dump_enabled_p ())
3493 dump_printf_loc (MSG_NOTE, vect_location,
3494 "***** Skipping vector mode %s, which would"
3495 " repeat the analysis for %s\n",
3496 GET_MODE_NAME (vector_modes[mode_i + 1]),
3497 GET_MODE_NAME (autodetected_vector_mode));
3498 mode_i += 1;
3500 mode_i++;
3502 if (!res)
3504 delete loop_vinfo;
3505 if (fatal)
3506 gcc_checking_assert (main_loop_vinfo == NULL);
3507 return opt_loop_vec_info::propagate_failure (res);
3510 return opt_loop_vec_info::success (loop_vinfo);
3513 /* Function vect_analyze_loop.
3515 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3516 for it. The different analyses will record information in the
3517 loop_vec_info struct. */
3518 opt_loop_vec_info
3519 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3521 DUMP_VECT_SCOPE ("analyze_loop_nest");
3523 if (loop_outer (loop)
3524 && loop_vec_info_for_loop (loop_outer (loop))
3525 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3526 return opt_loop_vec_info::failure_at (vect_location,
3527 "outer-loop already vectorized.\n");
3529 if (!find_loop_nest (loop, &shared->loop_nest))
3530 return opt_loop_vec_info::failure_at
3531 (vect_location,
3532 "not vectorized: loop nest containing two or more consecutive inner"
3533 " loops cannot be vectorized\n");
3535 /* Analyze the loop form. */
3536 vect_loop_form_info loop_form_info;
3537 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3538 if (!res)
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3542 "bad loop form.\n");
3543 return opt_loop_vec_info::propagate_failure (res);
3545 if (!integer_onep (loop_form_info.assumptions))
3547 /* We consider to vectorize this loop by versioning it under
3548 some assumptions. In order to do this, we need to clear
3549 existing information computed by scev and niter analyzer. */
3550 scev_reset_htab ();
3551 free_numbers_of_iterations_estimates (loop);
3552 /* Also set flag for this loop so that following scev and niter
3553 analysis are done under the assumptions. */
3554 loop_constraint_set (loop, LOOP_C_FINITE);
3557 auto_vector_modes vector_modes;
3558 /* Autodetect first vector size we try. */
3559 vector_modes.safe_push (VOIDmode);
3560 unsigned int autovec_flags
3561 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3562 loop->simdlen != 0);
3563 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3564 && !unlimited_cost_model (loop));
3565 machine_mode autodetected_vector_mode = VOIDmode;
3566 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3567 unsigned int mode_i = 0;
3568 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3570 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3571 a mode has not been analyzed. */
3572 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3573 for (unsigned i = 0; i < vector_modes.length (); ++i)
3574 cached_vf_per_mode.safe_push (0);
3576 /* First determine the main loop vectorization mode, either the first
3577 one that works, starting with auto-detecting the vector mode and then
3578 following the targets order of preference, or the one with the
3579 lowest cost if pick_lowest_cost_p. */
3580 while (1)
3582 bool fatal;
3583 unsigned int last_mode_i = mode_i;
3584 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3585 failed. */
3586 cached_vf_per_mode[last_mode_i] = -1;
3587 opt_loop_vec_info loop_vinfo
3588 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3589 NULL, vector_modes, mode_i,
3590 autodetected_vector_mode, fatal);
3591 if (fatal)
3592 break;
3594 if (loop_vinfo)
3596 /* Analysis has been successful, so update the VF value. The
3597 VF should always be a multiple of unroll_factor and we want to
3598 capture the original VF here. */
3599 cached_vf_per_mode[last_mode_i]
3600 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3601 loop_vinfo->suggested_unroll_factor);
3602 /* Once we hit the desired simdlen for the first time,
3603 discard any previous attempts. */
3604 if (simdlen
3605 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3607 delete first_loop_vinfo;
3608 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3609 simdlen = 0;
3611 else if (pick_lowest_cost_p
3612 && first_loop_vinfo
3613 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3615 /* Pick loop_vinfo over first_loop_vinfo. */
3616 delete first_loop_vinfo;
3617 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3619 if (first_loop_vinfo == NULL)
3620 first_loop_vinfo = loop_vinfo;
3621 else
3623 delete loop_vinfo;
3624 loop_vinfo = opt_loop_vec_info::success (NULL);
3627 /* Commit to first_loop_vinfo if we have no reason to try
3628 alternatives. */
3629 if (!simdlen && !pick_lowest_cost_p)
3630 break;
3632 if (mode_i == vector_modes.length ()
3633 || autodetected_vector_mode == VOIDmode)
3634 break;
3636 /* Try the next biggest vector size. */
3637 if (dump_enabled_p ())
3638 dump_printf_loc (MSG_NOTE, vect_location,
3639 "***** Re-trying analysis with vector mode %s\n",
3640 GET_MODE_NAME (vector_modes[mode_i]));
3642 if (!first_loop_vinfo)
3643 return opt_loop_vec_info::propagate_failure (res);
3645 if (dump_enabled_p ())
3646 dump_printf_loc (MSG_NOTE, vect_location,
3647 "***** Choosing vector mode %s\n",
3648 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3650 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3651 enabled, SIMDUID is not set, it is the innermost loop and we have
3652 either already found the loop's SIMDLEN or there was no SIMDLEN to
3653 begin with.
3654 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3655 bool vect_epilogues = (!simdlen
3656 && loop->inner == NULL
3657 && param_vect_epilogues_nomask
3658 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3659 && !loop->simduid);
3660 if (!vect_epilogues)
3661 return first_loop_vinfo;
3663 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3664 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3666 /* For epilogues start the analysis from the first mode. The motivation
3667 behind starting from the beginning comes from cases where the VECTOR_MODES
3668 array may contain length-agnostic and length-specific modes. Their
3669 ordering is not guaranteed, so we could end up picking a mode for the main
3670 loop that is after the epilogue's optimal mode. */
3671 vector_modes[0] = autodetected_vector_mode;
3672 mode_i = 0;
3674 bool supports_partial_vectors =
3675 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3676 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3678 while (1)
3680 /* If the target does not support partial vectors we can shorten the
3681 number of modes to analyze for the epilogue as we know we can't pick a
3682 mode that would lead to a VF at least as big as the
3683 FIRST_VINFO_VF. */
3684 if (!supports_partial_vectors
3685 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3687 mode_i++;
3688 if (mode_i == vector_modes.length ())
3689 break;
3690 continue;
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 "***** Re-trying epilogue analysis with vector "
3696 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3698 bool fatal;
3699 opt_loop_vec_info loop_vinfo
3700 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3701 first_loop_vinfo,
3702 vector_modes, mode_i,
3703 autodetected_vector_mode, fatal);
3704 if (fatal)
3705 break;
3707 if (loop_vinfo)
3709 if (pick_lowest_cost_p)
3711 /* Keep trying to roll back vectorization attempts while the
3712 loop_vec_infos they produced were worse than this one. */
3713 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3714 while (!vinfos.is_empty ()
3715 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3717 gcc_assert (vect_epilogues);
3718 delete vinfos.pop ();
3721 /* For now only allow one epilogue loop. */
3722 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3724 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3725 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3726 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3727 || maybe_ne (lowest_th, 0U));
3728 /* Keep track of the known smallest versioning
3729 threshold. */
3730 if (ordered_p (lowest_th, th))
3731 lowest_th = ordered_min (lowest_th, th);
3733 else
3735 delete loop_vinfo;
3736 loop_vinfo = opt_loop_vec_info::success (NULL);
3739 /* For now only allow one epilogue loop, but allow
3740 pick_lowest_cost_p to replace it, so commit to the
3741 first epilogue if we have no reason to try alternatives. */
3742 if (!pick_lowest_cost_p)
3743 break;
3746 if (mode_i == vector_modes.length ())
3747 break;
3751 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3753 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3754 if (dump_enabled_p ())
3755 dump_printf_loc (MSG_NOTE, vect_location,
3756 "***** Choosing epilogue vector mode %s\n",
3757 GET_MODE_NAME
3758 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3761 return first_loop_vinfo;
3764 /* Return true if there is an in-order reduction function for CODE, storing
3765 it in *REDUC_FN if so. */
3767 static bool
3768 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3770 /* We support MINUS_EXPR by negating the operand. This also preserves an
3771 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3772 (-0.0) = -0.0. */
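  /* For illustration: a fold-left (in-order) reduction with initial value
     INIT over elements v0, v1, v2, ... computes (((INIT + v0) + v1) + v2) ...
     in the original scalar order, and a loop such as

	 double s = init;
	 for (int i = 0; i < n; i++)
	   s -= a[i];

     is handled by the same IFN_FOLD_LEFT_PLUS applied to negated elements.  */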
3773 if (code == PLUS_EXPR || code == MINUS_EXPR)
3775 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3776 return true;
3778 return false;
3781 /* Function reduction_fn_for_scalar_code
3783 Input:
3784 CODE - tree_code of a reduction operation.
3786 Output:
3787 REDUC_FN - the corresponding internal function to be used to reduce the
3788 vector of partial results into a single scalar result, or IFN_LAST
3789 if the operation is a supported reduction operation, but does not have
3790 such an internal function.
3792 Return FALSE if CODE currently cannot be vectorized as reduction. */
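/* For example, a scalar max reduction such as

     int m = a[0];
     for (int i = 1; i < n; i++)
       m = a[i] > m ? a[i] : m;

   maps to IFN_REDUC_MAX for the final cross-lane step, whereas MULT_EXPR and
   MINUS_EXPR reductions are still supported but get IFN_LAST, so their
   partial results are combined by generic epilogue code instead.  */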
3794 bool
3795 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3797 if (code.is_tree_code ())
3798 switch (tree_code (code))
3800 case MAX_EXPR:
3801 *reduc_fn = IFN_REDUC_MAX;
3802 return true;
3804 case MIN_EXPR:
3805 *reduc_fn = IFN_REDUC_MIN;
3806 return true;
3808 case PLUS_EXPR:
3809 *reduc_fn = IFN_REDUC_PLUS;
3810 return true;
3812 case BIT_AND_EXPR:
3813 *reduc_fn = IFN_REDUC_AND;
3814 return true;
3816 case BIT_IOR_EXPR:
3817 *reduc_fn = IFN_REDUC_IOR;
3818 return true;
3820 case BIT_XOR_EXPR:
3821 *reduc_fn = IFN_REDUC_XOR;
3822 return true;
3824 case MULT_EXPR:
3825 case MINUS_EXPR:
3826 *reduc_fn = IFN_LAST;
3827 return true;
3829 default:
3830 return false;
3832 else
3833 switch (combined_fn (code))
3835 CASE_CFN_FMAX:
3836 *reduc_fn = IFN_REDUC_FMAX;
3837 return true;
3839 CASE_CFN_FMIN:
3840 *reduc_fn = IFN_REDUC_FMIN;
3841 return true;
3843 default:
3844 return false;
3848 /* If there is a neutral value X such that a reduction would not be affected
3849 by the introduction of additional X elements, return that X, otherwise
3850 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3851 of the scalar elements. If the reduction has just a single initial value
3852 then INITIAL_VALUE is that value, otherwise it is null.
3853 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3854 In that case no signed zero is returned. */
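/* For example, the neutral element is 0 for PLUS_EXPR (-0.0 when used
   mid-reduction on a type honoring signed zeros), 1 for MULT_EXPR, all-ones
   for BIT_AND_EXPR, 0 for BIT_IOR_EXPR and BIT_XOR_EXPR, while for MIN_EXPR,
   MAX_EXPR and fmin/fmax only the initial value itself is safe to use.  */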
3856 tree
3857 neutral_op_for_reduction (tree scalar_type, code_helper code,
3858 tree initial_value, bool as_initial)
3860 if (code.is_tree_code ())
3861 switch (tree_code (code))
3863 case DOT_PROD_EXPR:
3864 case SAD_EXPR:
3865 case MINUS_EXPR:
3866 case BIT_IOR_EXPR:
3867 case BIT_XOR_EXPR:
3868 return build_zero_cst (scalar_type);
3869 case WIDEN_SUM_EXPR:
3870 case PLUS_EXPR:
3871 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3872 return build_real (scalar_type, dconstm0);
3873 else
3874 return build_zero_cst (scalar_type);
3876 case MULT_EXPR:
3877 return build_one_cst (scalar_type);
3879 case BIT_AND_EXPR:
3880 return build_all_ones_cst (scalar_type);
3882 case MAX_EXPR:
3883 case MIN_EXPR:
3884 return initial_value;
3886 default:
3887 return NULL_TREE;
3889 else
3890 switch (combined_fn (code))
3892 CASE_CFN_FMIN:
3893 CASE_CFN_FMAX:
3894 return initial_value;
3896 default:
3897 return NULL_TREE;
3901 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3902 STMT is printed with a message MSG. */
3904 static void
3905 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3907 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3910 /* Return true if we need an in-order reduction for operation CODE
3911 on type TYPE, i.e. if the scalar evaluation order of the reduction
3912 must be preserved. */
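/* For instance, a floating-point accumulation such as

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   needs a fold-left reduction unless -fassociative-math permits
   reassociation, while float min/max and fmin/fmax reductions never do, and
   an integer addition only does when its overflow could trap.  */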
3914 bool
3915 needs_fold_left_reduction_p (tree type, code_helper code)
3917 /* CHECKME: check for !flag_finite_math_only too? */
3918 if (SCALAR_FLOAT_TYPE_P (type))
3920 if (code.is_tree_code ())
3921 switch (tree_code (code))
3923 case MIN_EXPR:
3924 case MAX_EXPR:
3925 return false;
3927 default:
3928 return !flag_associative_math;
3930 else
3931 switch (combined_fn (code))
3933 CASE_CFN_FMIN:
3934 CASE_CFN_FMAX:
3935 return false;
3937 default:
3938 return !flag_associative_math;
3942 if (INTEGRAL_TYPE_P (type))
3943 return (!code.is_tree_code ()
3944 || !operation_no_trapping_overflow (type, tree_code (code)));
3946 if (SAT_FIXED_POINT_TYPE_P (type))
3947 return true;
3949 return false;
3952 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3953 has a handled computation expression. Store the main reduction
3954 operation in *CODE. */
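/* For illustration, with hypothetical SSA names, given

     sum_1 = PHI <sum_0 (preheader), sum_3 (latch)>
     sum_2 = sum_1 + a[i];
     sum_3 = sum_2 + b[i];

   the recorded path leads from the latch value sum_3 through sum_2 back to
   the PHI result sum_1 and *CODE is set to PLUS_EXPR.  */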
3956 static bool
3957 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3958 tree loop_arg, code_helper *code,
3959 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3961 auto_bitmap visited;
3962 tree lookfor = PHI_RESULT (phi);
3963 ssa_op_iter curri;
3964 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3965 while (USE_FROM_PTR (curr) != loop_arg)
3966 curr = op_iter_next_use (&curri);
3967 curri.i = curri.numops;
3970 path.safe_push (std::make_pair (curri, curr));
3971 tree use = USE_FROM_PTR (curr);
3972 if (use == lookfor)
3973 break;
3974 gimple *def = SSA_NAME_DEF_STMT (use);
3975 if (gimple_nop_p (def)
3976 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3978 pop:
3981 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3982 curri = x.first;
3983 curr = x.second;
3985 curr = op_iter_next_use (&curri);
3986 /* Skip already visited or non-SSA operands (from iterating
3987 over PHI args). */
3988 while (curr != NULL_USE_OPERAND_P
3989 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3990 || ! bitmap_set_bit (visited,
3991 SSA_NAME_VERSION
3992 (USE_FROM_PTR (curr)))));
3994 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3995 if (curr == NULL_USE_OPERAND_P)
3996 break;
3998 else
4000 if (gimple_code (def) == GIMPLE_PHI)
4001 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4002 else
4003 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4004 while (curr != NULL_USE_OPERAND_P
4005 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4006 || ! bitmap_set_bit (visited,
4007 SSA_NAME_VERSION
4008 (USE_FROM_PTR (curr)))))
4009 curr = op_iter_next_use (&curri);
4010 if (curr == NULL_USE_OPERAND_P)
4011 goto pop;
4014 while (1);
4015 if (dump_file && (dump_flags & TDF_DETAILS))
4017 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4018 unsigned i;
4019 std::pair<ssa_op_iter, use_operand_p> *x;
4020 FOR_EACH_VEC_ELT (path, i, x)
4021 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4022 dump_printf (MSG_NOTE, "\n");
4025 /* Check whether the reduction path detected is valid. */
4026 bool fail = path.length () == 0;
4027 bool neg = false;
4028 int sign = -1;
4029 *code = ERROR_MARK;
4030 for (unsigned i = 1; i < path.length (); ++i)
4032 gimple *use_stmt = USE_STMT (path[i].second);
4033 gimple_match_op op;
4034 if (!gimple_extract_op (use_stmt, &op))
4036 fail = true;
4037 break;
4039 unsigned int opi = op.num_ops;
4040 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4042 /* The following makes sure we can compute the operand index
4043 easily plus it mostly disallows chaining via COND_EXPR condition
4044 operands. */
4045 for (opi = 0; opi < op.num_ops; ++opi)
4046 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4047 break;
4049 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4051 for (opi = 0; opi < op.num_ops; ++opi)
4052 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4053 break;
4055 if (opi == op.num_ops)
4057 fail = true;
4058 break;
4060 op.code = canonicalize_code (op.code, op.type);
4061 if (op.code == MINUS_EXPR)
4063 op.code = PLUS_EXPR;
4064 /* Track whether we negate the reduction value each iteration. */
4065 if (op.ops[1] == op.ops[opi])
4066 neg = ! neg;
4068 if (CONVERT_EXPR_CODE_P (op.code)
4069 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4071 else if (*code == ERROR_MARK)
4073 *code = op.code;
4074 sign = TYPE_SIGN (op.type);
4076 else if (op.code != *code)
4078 fail = true;
4079 break;
4081 else if ((op.code == MIN_EXPR
4082 || op.code == MAX_EXPR)
4083 && sign != TYPE_SIGN (op.type))
4085 fail = true;
4086 break;
4088 /* Check there's only a single stmt the op is used on. For the
4089 non-value-changing tail and the last stmt, allow out-of-loop uses.
4090 ??? We could relax this and handle arbitrary live stmts by
4091 forcing a scalar epilogue for example. */
4092 imm_use_iterator imm_iter;
4093 use_operand_p use_p;
4094 gimple *op_use_stmt;
4095 unsigned cnt = 0;
4096 bool cond_fn_p = op.code.is_internal_fn ()
4097 && (conditional_internal_fn_code (internal_fn (op.code))
4098 != ERROR_MARK);
4100 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4102 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4103 op1 twice (once as definition, once as else) in the same operation.
4104 Allow this. */
4105 if (cond_fn_p)
4107 gcall *call = dyn_cast<gcall *> (use_stmt);
4108 unsigned else_pos
4109 = internal_fn_else_index (internal_fn (op.code));
4111 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4113 if (j == else_pos)
4114 continue;
4115 if (gimple_call_arg (call, j) == op.ops[opi])
4116 cnt++;
4119 else if (!is_gimple_debug (op_use_stmt)
4120 && (*code != ERROR_MARK
4121 || flow_bb_inside_loop_p (loop,
4122 gimple_bb (op_use_stmt))))
4123 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4124 cnt++;
4127 if (cnt != 1)
4129 fail = true;
4130 break;
4133 return ! fail && ! neg && *code != ERROR_MARK;
4136 bool
4137 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4138 tree loop_arg, enum tree_code code)
4140 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4141 code_helper code_;
4142 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4143 && code_ == code);
4148 /* Function vect_is_simple_reduction
4150 (1) Detect a cross-iteration def-use cycle that represents a simple
4151 reduction computation. We look for the following pattern:
4153 loop_header:
4154 a1 = phi < a0, a2 >
4155 a3 = ...
4156 a2 = operation (a3, a1)
4160 a3 = ...
4161 loop_header:
4162 a1 = phi < a0, a2 >
4163 a2 = operation (a3, a1)
4165 such that:
4166 1. operation is commutative and associative and it is safe to
4167 change the order of the computation
4168 2. no uses for a2 in the loop (a2 is used out of the loop)
4169 3. no uses of a1 in the loop besides the reduction operation
4170 4. no uses of a1 outside the loop.
4172 Conditions 1,4 are tested here.
4173 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4175 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4176 nested cycles.
4178 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4179 reductions:
4181 a1 = phi < a0, a2 >
4182 inner loop (def of a3)
4183 a2 = phi < a3 >
4185 (4) Detect condition expressions, i.e.:
4186 for (int i = 0; i < N; i++)
4187 if (a[i] < val)
4188 ret_val = a[i];
4192 static stmt_vec_info
4193 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4194 bool *double_reduc, bool *reduc_chain_p, bool slp)
4196 gphi *phi = as_a <gphi *> (phi_info->stmt);
4197 gimple *phi_use_stmt = NULL;
4198 imm_use_iterator imm_iter;
4199 use_operand_p use_p;
4201 *double_reduc = false;
4202 *reduc_chain_p = false;
4203 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4205 tree phi_name = PHI_RESULT (phi);
4206 /* ??? If there are no uses of the PHI result the inner loop reduction
4207 won't be detected as possibly double-reduction by vectorizable_reduction
4208 because that tries to walk the PHI arg from the preheader edge which
4209 can be constant. See PR60382. */
4210 if (has_zero_uses (phi_name))
4211 return NULL;
4212 class loop *loop = (gimple_bb (phi))->loop_father;
4213 unsigned nphi_def_loop_uses = 0;
4214 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4216 gimple *use_stmt = USE_STMT (use_p);
4217 if (is_gimple_debug (use_stmt))
4218 continue;
4220 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4222 if (dump_enabled_p ())
4223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4224 "intermediate value used outside loop.\n");
4226 return NULL;
4229 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4230 op1 twice (once as definition, once as else) in the same operation.
4231 Only count it as one. */
4232 if (use_stmt != phi_use_stmt)
4234 nphi_def_loop_uses++;
4235 phi_use_stmt = use_stmt;
4239 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4240 if (TREE_CODE (latch_def) != SSA_NAME)
4242 if (dump_enabled_p ())
4243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4244 "reduction: not ssa_name: %T\n", latch_def);
4245 return NULL;
4248 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4249 if (!def_stmt_info
4250 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4251 return NULL;
4253 bool nested_in_vect_loop
4254 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4255 unsigned nlatch_def_loop_uses = 0;
4256 auto_vec<gphi *, 3> lcphis;
4257 bool inner_loop_of_double_reduc = false;
4258 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4260 gimple *use_stmt = USE_STMT (use_p);
4261 if (is_gimple_debug (use_stmt))
4262 continue;
4263 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4264 nlatch_def_loop_uses++;
4265 else
4267 /* We can have more than one loop-closed PHI. */
4268 lcphis.safe_push (as_a <gphi *> (use_stmt));
4269 if (nested_in_vect_loop
4270 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4271 == vect_double_reduction_def))
4272 inner_loop_of_double_reduc = true;
4276 /* If we are vectorizing an inner reduction we are executing that
4277 in the original order only if we are not dealing with a
4278 double reduction. */
4279 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4281 if (dump_enabled_p ())
4282 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4283 "detected nested cycle: ");
4284 return def_stmt_info;
4287 /* When the inner loop of a double reduction ends up with more than
4288 one loop-closed PHI we have failed to classify alternate such
4289 PHIs as double reduction, leading to wrong code. See PR103237. */
4290 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4292 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4294 "unhandle double reduction\n");
4295 return NULL;
4298 /* If this isn't a nested cycle or if the nested cycle reduction value
4299 is used outside of the inner loop we cannot handle uses of the reduction
4300 value. */
4301 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4303 if (dump_enabled_p ())
4304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4305 "reduction used in loop.\n");
4306 return NULL;
4309 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4310 defined in the inner loop. */
4311 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4313 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4314 if (gimple_phi_num_args (def_stmt) != 1
4315 || TREE_CODE (op1) != SSA_NAME)
4317 if (dump_enabled_p ())
4318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4319 "unsupported phi node definition.\n");
4321 return NULL;
4324 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4325 and the latch definition op1. */
4326 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4327 if (gimple_bb (def1)
4328 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4329 && loop->inner
4330 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4331 && (is_gimple_assign (def1) || is_gimple_call (def1))
4332 && is_a <gphi *> (phi_use_stmt)
4333 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4334 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4335 loop_latch_edge (loop->inner))))
4337 if (dump_enabled_p ())
4338 report_vect_op (MSG_NOTE, def_stmt,
4339 "detected double reduction: ");
4341 *double_reduc = true;
4342 return def_stmt_info;
4345 return NULL;
4348 /* Look for the expression computing latch_def from the loop PHI result. */
4349 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4350 code_helper code;
4351 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4352 path))
4354 STMT_VINFO_REDUC_CODE (phi_info) = code;
4355 if (code == COND_EXPR && !nested_in_vect_loop)
4356 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4358 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4359 reduction chain for which the additional restriction is that
4360 all operations in the chain are the same. */
4361 auto_vec<stmt_vec_info, 8> reduc_chain;
4362 unsigned i;
4363 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4364 for (i = path.length () - 1; i >= 1; --i)
4366 gimple *stmt = USE_STMT (path[i].second);
4367 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4368 gimple_match_op op;
4369 if (!gimple_extract_op (stmt, &op))
4370 gcc_unreachable ();
4371 if (gassign *assign = dyn_cast<gassign *> (stmt))
4372 STMT_VINFO_REDUC_IDX (stmt_info)
4373 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4374 else
4376 gcall *call = as_a<gcall *> (stmt);
4377 STMT_VINFO_REDUC_IDX (stmt_info)
4378 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4380 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4381 && (i == 1 || i == path.length () - 1));
4382 if ((op.code != code && !leading_conversion)
4383 /* We can only handle the final value in epilogue
4384 generation for reduction chains. */
4385 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4386 is_slp_reduc = false;
4387 /* For reduction chains we support trailing/leading
4388 conversions. We do not store those in the actual chain. */
4389 if (leading_conversion)
4390 continue;
4391 reduc_chain.safe_push (stmt_info);
4393 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4395 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4397 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4398 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4400 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4401 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4403 /* Save the chain for further analysis in SLP detection. */
4404 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4405 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4407 *reduc_chain_p = true;
4408 if (dump_enabled_p ())
4409 dump_printf_loc (MSG_NOTE, vect_location,
4410 "reduction: detected reduction chain\n");
4412 else if (dump_enabled_p ())
4413 dump_printf_loc (MSG_NOTE, vect_location,
4414 "reduction: detected reduction\n");
4416 return def_stmt_info;
4419 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_NOTE, vect_location,
4421 "reduction: unknown pattern\n");
4423 return NULL;
4426 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4427 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4428 or -1 if not known. */
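/* For example, with an assumed VF of 8, a known iteration count of 100 and
   PEEL_ITERS_PROLOGUE == 3 the epilogue peels (100 - 3) % 8 == 1 iteration,
   whereas with an unknown iteration count the estimate is simply VF / 2.  */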
4430 static int
4431 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4433 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4434 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4436 if (dump_enabled_p ())
4437 dump_printf_loc (MSG_NOTE, vect_location,
4438 "cost model: epilogue peel iters set to vf/2 "
4439 "because loop iterations are unknown .\n");
4440 return assumed_vf / 2;
4442 else
4444 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4445 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4446 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4447 /* If we need to peel for gaps but no epilogue peeling would otherwise
4448 be required, we have to peel VF iterations. */
4449 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4450 peel_iters_epilogue = assumed_vf;
4451 return peel_iters_epilogue;
4455 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4457 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4458 int *peel_iters_epilogue,
4459 stmt_vector_for_cost *scalar_cost_vec,
4460 stmt_vector_for_cost *prologue_cost_vec,
4461 stmt_vector_for_cost *epilogue_cost_vec)
4463 int retval = 0;
4465 *peel_iters_epilogue
4466 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4468 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4470 /* If peeled iterations are known but number of scalar loop
4471 iterations are unknown, count a taken branch per peeled loop. */
4472 if (peel_iters_prologue > 0)
4473 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4474 vect_prologue);
4475 if (*peel_iters_epilogue > 0)
4476 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4477 vect_epilogue);
4480 stmt_info_for_cost *si;
4481 int j;
4482 if (peel_iters_prologue)
4483 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4484 retval += record_stmt_cost (prologue_cost_vec,
4485 si->count * peel_iters_prologue,
4486 si->kind, si->stmt_info, si->misalign,
4487 vect_prologue);
4488 if (*peel_iters_epilogue)
4489 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4490 retval += record_stmt_cost (epilogue_cost_vec,
4491 si->count * *peel_iters_epilogue,
4492 si->kind, si->stmt_info, si->misalign,
4493 vect_epilogue);
4495 return retval;
4498 /* Function vect_estimate_min_profitable_iters
4500 Return the number of iterations required for the vector version of the
4501 loop to be profitable relative to the cost of the scalar version of the
4502 loop.
4504 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4505 of iterations for vectorization. -1 value means loop vectorization
4506 is not profitable. This returned value may be used for dynamic
4507 profitability check.
4509 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4510 for static check against estimated number of iterations. */
4512 static void
4513 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4514 int *ret_min_profitable_niters,
4515 int *ret_min_profitable_estimate,
4516 unsigned *suggested_unroll_factor)
4518 int min_profitable_iters;
4519 int min_profitable_estimate;
4520 int peel_iters_prologue;
4521 int peel_iters_epilogue;
4522 unsigned vec_inside_cost = 0;
4523 int vec_outside_cost = 0;
4524 unsigned vec_prologue_cost = 0;
4525 unsigned vec_epilogue_cost = 0;
4526 int scalar_single_iter_cost = 0;
4527 int scalar_outside_cost = 0;
4528 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4529 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4530 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4532 /* Cost model disabled. */
4533 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4535 if (dump_enabled_p ())
4536 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4537 *ret_min_profitable_niters = 0;
4538 *ret_min_profitable_estimate = 0;
4539 return;
4542 /* Requires loop versioning tests to handle misalignment. */
4543 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4545 /* FIXME: Make cost depend on complexity of individual check. */
4546 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4547 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4548 if (dump_enabled_p ())
4549 dump_printf (MSG_NOTE,
4550 "cost model: Adding cost of checks for loop "
4551 "versioning to treat misalignment.\n");
4554 /* Requires loop versioning with alias checks. */
4555 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4557 /* FIXME: Make cost depend on complexity of individual check. */
4558 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4559 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4560 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4561 if (len)
4562 /* Count LEN - 1 ANDs and LEN comparisons. */
4563 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4564 scalar_stmt, vect_prologue);
4565 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4566 if (len)
4568 /* Count LEN - 1 ANDs and LEN comparisons. */
4569 unsigned int nstmts = len * 2 - 1;
4570 /* +1 for each bias that needs adding. */
4571 for (unsigned int i = 0; i < len; ++i)
4572 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4573 nstmts += 1;
4574 (void) add_stmt_cost (target_cost_data, nstmts,
4575 scalar_stmt, vect_prologue);
4577 if (dump_enabled_p ())
4578 dump_printf (MSG_NOTE,
4579 "cost model: Adding cost of checks for loop "
4580 "versioning aliasing.\n");
4583 /* Requires loop versioning with niter checks. */
4584 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4586 /* FIXME: Make cost depend on complexity of individual check. */
4587 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4588 NULL, NULL, NULL_TREE, 0, vect_prologue);
4589 if (dump_enabled_p ())
4590 dump_printf (MSG_NOTE,
4591 "cost model: Adding cost of checks for loop "
4592 "versioning niters.\n");
4595 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4596 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4597 vect_prologue);
4599 /* Count statements in scalar loop. Using this as scalar cost for a single
4600 iteration for now.
4602 TODO: Add outer loop support.
4604 TODO: Consider assigning different costs to different scalar
4605 statements. */
4607 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4609 /* Add additional cost for the peeled instructions in prologue and epilogue
4610 loop. (For fully-masked loops there will be no peeling.)
4612 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4613 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4615 TODO: Build an expression that represents peel_iters for prologue and
4616 epilogue to be used in a run-time test. */
4618 bool prologue_need_br_taken_cost = false;
4619 bool prologue_need_br_not_taken_cost = false;
4621 /* Calculate peel_iters_prologue. */
4622 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4623 peel_iters_prologue = 0;
4624 else if (npeel < 0)
4626 peel_iters_prologue = assumed_vf / 2;
4627 if (dump_enabled_p ())
4628 dump_printf (MSG_NOTE, "cost model: "
4629 "prologue peel iters set to vf/2.\n");
4631 /* If peeled iterations are unknown, count a taken branch and a not taken
4632 branch per peeled loop. Even if scalar loop iterations are known,
4633 vector iterations are not known since peeled prologue iterations are
4634 not known. Hence guards remain the same. */
4635 prologue_need_br_taken_cost = true;
4636 prologue_need_br_not_taken_cost = true;
4638 else
4640 peel_iters_prologue = npeel;
4641 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4642 /* If peeled iterations are known but number of scalar loop
4643 iterations are unknown, count a taken branch per peeled loop. */
4644 prologue_need_br_taken_cost = true;
4647 bool epilogue_need_br_taken_cost = false;
4648 bool epilogue_need_br_not_taken_cost = false;
4650 /* Calculate peel_iters_epilogue. */
4651 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4652 /* We need to peel exactly one iteration for gaps. */
4653 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4654 else if (npeel < 0)
4656 /* If peeling for alignment is unknown, the loop bound of the main loop
4657 becomes unknown. */
4658 peel_iters_epilogue = assumed_vf / 2;
4659 if (dump_enabled_p ())
4660 dump_printf (MSG_NOTE, "cost model: "
4661 "epilogue peel iters set to vf/2 because "
4662 "peeling for alignment is unknown.\n");
4664 /* See the same reason above in peel_iters_prologue calculation. */
4665 epilogue_need_br_taken_cost = true;
4666 epilogue_need_br_not_taken_cost = true;
4668 else
4670 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4671 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4672 /* If peeled iterations are known but number of scalar loop
4673 iterations are unknown, count a taken branch per peeled loop. */
4674 epilogue_need_br_taken_cost = true;
4677 stmt_info_for_cost *si;
4678 int j;
4679 /* Add costs associated with peel_iters_prologue. */
4680 if (peel_iters_prologue)
4681 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4683 (void) add_stmt_cost (target_cost_data,
4684 si->count * peel_iters_prologue, si->kind,
4685 si->stmt_info, si->node, si->vectype,
4686 si->misalign, vect_prologue);
4689 /* Add costs associated with peel_iters_epilogue. */
4690 if (peel_iters_epilogue)
4691 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4693 (void) add_stmt_cost (target_cost_data,
4694 si->count * peel_iters_epilogue, si->kind,
4695 si->stmt_info, si->node, si->vectype,
4696 si->misalign, vect_epilogue);
4699 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4701 if (prologue_need_br_taken_cost)
4702 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4703 vect_prologue);
4705 if (prologue_need_br_not_taken_cost)
4706 (void) add_stmt_cost (target_cost_data, 1,
4707 cond_branch_not_taken, vect_prologue);
4709 if (epilogue_need_br_taken_cost)
4710 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4711 vect_epilogue);
4713 if (epilogue_need_br_not_taken_cost)
4714 (void) add_stmt_cost (target_cost_data, 1,
4715 cond_branch_not_taken, vect_epilogue);
4717 /* Take care of special costs for rgroup controls of partial vectors. */
4718 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4719 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4720 == vect_partial_vectors_avx512))
4722 /* Calculate how many masks we need to generate. */
4723 unsigned int num_masks = 0;
4724 bool need_saturation = false;
4725 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4726 if (rgm.type)
4728 unsigned nvectors = rgm.factor;
4729 num_masks += nvectors;
4730 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4731 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4732 need_saturation = true;
4735 /* ??? The target isn't able to identify the costs below as
4736 producing masks so it cannot penalize cases where we'd run
4737 out of mask registers for example. */
4739 /* ??? We are also failing to account for smaller vector masks
4740 we generate by splitting larger masks in vect_get_loop_mask. */
4742 /* In the worst case, we need to generate each mask in the prologue
4743 and in the loop body. We need one splat per group and one
4744 compare per mask.
4746 Sometimes the prologue mask will fold to a constant,
4747 so the actual prologue cost might be smaller. However, it's
4748 simpler and safer to use the worst-case cost; if this ends up
4749 being the tie-breaker between vectorizing or not, then it's
4750 probably better not to vectorize. */
4751 (void) add_stmt_cost (target_cost_data,
4752 num_masks
4753 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4754 vector_stmt, NULL, NULL, NULL_TREE, 0,
4755 vect_prologue);
4756 (void) add_stmt_cost (target_cost_data,
4757 num_masks
4758 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4759 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4761 /* When we need saturation we need it both in the prologue and
4762 the epilogue. */
4763 if (need_saturation)
4765 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4766 NULL, NULL, NULL_TREE, 0, vect_prologue);
4767 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4768 NULL, NULL, NULL_TREE, 0, vect_body);
4771 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4772 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4773 == vect_partial_vectors_while_ult))
4775 /* Calculate how many masks we need to generate. */
4776 unsigned int num_masks = 0;
4777 rgroup_controls *rgm;
4778 unsigned int num_vectors_m1;
4779 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4780 num_vectors_m1, rgm)
4781 if (rgm->type)
4782 num_masks += num_vectors_m1 + 1;
4783 gcc_assert (num_masks > 0);
4785 /* In the worst case, we need to generate each mask in the prologue
4786 and in the loop body. One of the loop body mask instructions
4787 replaces the comparison in the scalar loop, and since we don't
4788 count the scalar comparison against the scalar body, we shouldn't
4789 count that vector instruction against the vector body either.
4791 Sometimes we can use unpacks instead of generating prologue
4792 masks and sometimes the prologue mask will fold to a constant,
4793 so the actual prologue cost might be smaller. However, it's
4794 simpler and safer to use the worst-case cost; if this ends up
4795 being the tie-breaker between vectorizing or not, then it's
4796 probably better not to vectorize. */
4797 (void) add_stmt_cost (target_cost_data, num_masks,
4798 vector_stmt, NULL, NULL, NULL_TREE, 0,
4799 vect_prologue);
4800 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4801 vector_stmt, NULL, NULL, NULL_TREE, 0,
4802 vect_body);
4804 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4806 /* Referring to the functions vect_set_loop_condition_partial_vectors
4807 and vect_set_loop_controls_directly, we need to generate each
4808 length in the prologue and in the loop body if required. Although
4809 there are some possible optimizations, we consider the worst case
4810 here. */
4812 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4813 signed char partial_load_store_bias
4814 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4815 bool need_iterate_p
4816 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4817 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4819 /* Calculate how many statements to be added. */
4820 unsigned int prologue_stmts = 0;
4821 unsigned int body_stmts = 0;
4823 rgroup_controls *rgc;
4824 unsigned int num_vectors_m1;
4825 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4826 if (rgc->type)
4828 /* May need one SHIFT for nitems_total computation. */
4829 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4830 if (nitems != 1 && !niters_known_p)
4831 prologue_stmts += 1;
4833 /* May need one MAX and one MINUS for wrap around. */
4834 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4835 prologue_stmts += 2;
4837 /* Need one MAX and one MINUS for each batch limit except for
4838 the 1st one. */
4839 prologue_stmts += num_vectors_m1 * 2;
4841 unsigned int num_vectors = num_vectors_m1 + 1;
4843 /* Need to set up lengths in prologue, only one MIN required
4844 for each since start index is zero. */
4845 prologue_stmts += num_vectors;
4847 /* If we have a non-zero partial load bias, we need one PLUS
4848 to adjust the load length. */
4849 if (partial_load_store_bias != 0)
4850 body_stmts += 1;
4852 /* Each may need two MINs and one MINUS to update lengths in body
4853 for next iteration. */
4854 if (need_iterate_p)
4855 body_stmts += 3 * num_vectors;
4858 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4859 scalar_stmt, vect_prologue);
4860 (void) add_stmt_cost (target_cost_data, body_stmts,
4861 scalar_stmt, vect_body);
4864 /* FORNOW: The scalar outside cost is incremented in one of the
4865 following ways:
4867 1. The vectorizer checks for alignment and aliasing and generates
4868 a condition that allows dynamic vectorization. A cost model
4869 check is ANDED with the versioning condition. Hence scalar code
4870 path now has the added cost of the versioning check.
4872 if (cost > th & versioning_check)
4873 jmp to vector code
4875 Hence run-time scalar is incremented by not-taken branch cost.
4877 2. The vectorizer then checks if a prologue is required. If the
4878 cost model check was not done before during versioning, it has to
4879 be done before the prologue check.
4881 if (cost <= th)
4882 prologue = scalar_iters
4883 if (prologue == 0)
4884 jmp to vector code
4885 else
4886 execute prologue
4887 if (prologue == num_iters)
4888 go to exit
4890 Hence the run-time scalar cost is incremented by a taken branch,
4891 plus a not-taken branch, plus a taken branch cost.
4893 3. The vectorizer then checks if an epilogue is required. If the
4894 cost model check was not done before during prologue check, it
4895 has to be done with the epilogue check.
4897 if (prologue == 0)
4898 jmp to vector code
4899 else
4900 execute prologue
4901 if (prologue == num_iters)
4902 go to exit
4903 vector code:
4904 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4905 jmp to epilogue
4907 Hence the run-time scalar cost should be incremented by 2 taken
4908 branches.
4910 TODO: The back end may reorder the BBS's differently and reverse
4911 conditions/branch directions. Change the estimates below to
4912 something more reasonable. */
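  /* Concretely, the code below charges the scalar path
     cond_branch_not_taken for case 1 (versioning), two cond_branch_taken
     plus one cond_branch_not_taken for case 2 (cost check folded into the
     prologue guard), and two cond_branch_taken for case 3 (cost check
     folded into the epilogue guard).  */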
4914 /* If the number of iterations is known and we do not do versioning, we can
4915 decide whether to vectorize at compile time. Hence the scalar version
4916 does not carry cost model guard costs. */
4917 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4918 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4920 /* Cost model check occurs at versioning. */
4921 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4922 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4923 else
4925 /* Cost model check occurs at prologue generation. */
4926 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4927 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4928 + vect_get_stmt_cost (cond_branch_not_taken);
4929 /* Cost model check occurs at epilogue generation. */
4930 else
4931 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4935 /* Complete the target-specific cost calculations. */
4936 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4937 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4938 suggested_unroll_factor);
4940 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4941 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4942 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4943 *suggested_unroll_factor,
4944 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4946 if (dump_enabled_p ())
4947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4948 "can't unroll as unrolled vectorization factor larger"
4949 " than maximum vectorization factor: "
4950 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4951 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4952 *suggested_unroll_factor = 1;
4955 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4957 if (dump_enabled_p ())
4959 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4960 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4961 vec_inside_cost);
4962 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4963 vec_prologue_cost);
4964 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4965 vec_epilogue_cost);
4966 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4967 scalar_single_iter_cost);
4968 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4969 scalar_outside_cost);
4970 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4971 vec_outside_cost);
4972 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4973 peel_iters_prologue);
4974 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4975 peel_iters_epilogue);
4978 /* Calculate number of iterations required to make the vector version
4979 profitable, relative to the loop bodies only. The following condition
4980 must hold true:
4981 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4982 where
4983 SIC = scalar iteration cost, VIC = vector iteration cost,
4984 VOC = vector outside cost, VF = vectorization factor,
4985 NPEEL = prologue iterations + epilogue iterations,
4986 SOC = scalar outside cost for run time cost model check. */
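  /* As a purely illustrative instance with made-up costs: SIC = 1, VIC = 4,
     VF = 8, NPEEL = 2, VOC = 20 and SOC = 4 turn the condition into
       niters + 4 > 4 * ((niters - 2) / 8) + 20
     which, treating the division as exact, first holds for niters >= 31, so
     roughly 31 scalar iterations are needed before vectorization pays off
     under these numbers.  */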
4988 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4989 - vec_inside_cost);
4990 if (saving_per_viter <= 0)
4992 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4993 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4994 "vectorization did not happen for a simd loop");
4996 if (dump_enabled_p ())
4997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4998 "cost model: the vector iteration cost = %d "
4999 "divided by the scalar iteration cost = %d "
5000 "is greater or equal to the vectorization factor = %d"
5001 ".\n",
5002 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5003 *ret_min_profitable_niters = -1;
5004 *ret_min_profitable_estimate = -1;
5005 return;
5008 /* ??? The "if" arm is written to handle all cases; see below for what
5009 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5010 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5012 /* Rewriting the condition above in terms of the number of
5013 vector iterations (vniters) rather than the number of
5014 scalar iterations (niters) gives:
5016 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5018 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5020 For integer N, X and Y when X > 0:
5022 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5023 int outside_overhead = (vec_outside_cost
5024 - scalar_single_iter_cost * peel_iters_prologue
5025 - scalar_single_iter_cost * peel_iters_epilogue
5026 - scalar_outside_cost);
5027 /* We're only interested in cases that require at least one
5028 vector iteration. */
5029 int min_vec_niters = 1;
5030 if (outside_overhead > 0)
5031 min_vec_niters = outside_overhead / saving_per_viter + 1;
5033 if (dump_enabled_p ())
5034 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5035 min_vec_niters);
5037 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5039 /* Now that we know the minimum number of vector iterations,
5040 find the minimum niters for which the scalar cost is larger:
5042 SIC * niters > VIC * vniters + VOC - SOC
5044 We know that the minimum niters is no more than
5045 vniters * VF + NPEEL, but it might be (and often is) less
5046 than that if a partial vector iteration is cheaper than the
5047 equivalent scalar code. */
5048 int threshold = (vec_inside_cost * min_vec_niters
5049 + vec_outside_cost
5050 - scalar_outside_cost);
5051 if (threshold <= 0)
5052 min_profitable_iters = 1;
5053 else
5054 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5056 else
5057 /* Convert the number of vector iterations into a number of
5058 scalar iterations. */
5059 min_profitable_iters = (min_vec_niters * assumed_vf
5060 + peel_iters_prologue
5061 + peel_iters_epilogue);
5063 else
5065 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5066 * assumed_vf
5067 - vec_inside_cost * peel_iters_prologue
5068 - vec_inside_cost * peel_iters_epilogue);
5069 if (min_profitable_iters <= 0)
5070 min_profitable_iters = 0;
5071 else
5073 min_profitable_iters /= saving_per_viter;
5075 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5076 <= (((int) vec_inside_cost * min_profitable_iters)
5077 + (((int) vec_outside_cost - scalar_outside_cost)
5078 * assumed_vf)))
5079 min_profitable_iters++;
5083 if (dump_enabled_p ())
5084 dump_printf (MSG_NOTE,
5085 " Calculated minimum iters for profitability: %d\n",
5086 min_profitable_iters);
5088 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5089 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5090 /* We want the vectorized loop to execute at least once. */
5091 min_profitable_iters = assumed_vf + peel_iters_prologue;
5092 else if (min_profitable_iters < peel_iters_prologue)
5093 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5094 vectorized loop executes at least once. */
5095 min_profitable_iters = peel_iters_prologue;
5097 if (dump_enabled_p ())
5098 dump_printf_loc (MSG_NOTE, vect_location,
5099 " Runtime profitability threshold = %d\n",
5100 min_profitable_iters);
5102 *ret_min_profitable_niters = min_profitable_iters;
5104 /* Calculate number of iterations required to make the vector version
5105 profitable, relative to the loop bodies only.
5107 The cost of the non-vectorized variant is SIC * niters; the vector variant
5108 must beat it at the expected loop trip count, i.e. the following condition must hold true:
5109 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5111 if (vec_outside_cost <= 0)
5112 min_profitable_estimate = 0;
5113 /* ??? This "else if" arm is written to handle all cases; see below for
5114 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5115 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5117 /* This is a repeat of the code above, but with + SOC rather
5118 than - SOC. */
5119 int outside_overhead = (vec_outside_cost
5120 - scalar_single_iter_cost * peel_iters_prologue
5121 - scalar_single_iter_cost * peel_iters_epilogue
5122 + scalar_outside_cost);
5123 int min_vec_niters = 1;
5124 if (outside_overhead > 0)
5125 min_vec_niters = outside_overhead / saving_per_viter + 1;
5127 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5129 int threshold = (vec_inside_cost * min_vec_niters
5130 + vec_outside_cost
5131 + scalar_outside_cost);
5132 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5134 else
5135 min_profitable_estimate = (min_vec_niters * assumed_vf
5136 + peel_iters_prologue
5137 + peel_iters_epilogue);
5139 else
5141 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5142 * assumed_vf
5143 - vec_inside_cost * peel_iters_prologue
5144 - vec_inside_cost * peel_iters_epilogue)
5145 / ((scalar_single_iter_cost * assumed_vf)
5146 - vec_inside_cost);
5148 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5149 if (dump_enabled_p ())
5150 dump_printf_loc (MSG_NOTE, vect_location,
5151 " Static estimate profitability threshold = %d\n",
5152 min_profitable_estimate);
5154 *ret_min_profitable_estimate = min_profitable_estimate;
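/* Editorial illustration (not part of GCC): a minimal, brute-force sketch of
   the profitability inequality documented above,
       SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC,
   which the code above solves in closed form.  All names below are
   hypothetical, and the integer division mirrors the comment rather than the
   exact GCC computation.  */

static int
sketch_min_profitable_niters (int sic, int vic, int voc, int soc,
                              int vf, int npeel, int max_niters)
{
  for (int niters = npeel; niters <= max_niters; ++niters)
    {
      int vniters = (niters - npeel) / vf;        /* full vector iterations */
      long scalar_cost = (long) sic * niters;
      long vector_cost = (long) vic * vniters + voc + soc;
      if (scalar_cost > vector_cost)
        return niters;                            /* first profitable trip count */
    }
  return -1;                                      /* never profitable up to the bound */
}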
5157 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5158 vector elements (not bits) for a vector with NELT elements. */
5159 static void
5160 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5161 vec_perm_builder *sel)
5163 /* The encoding is a single stepped pattern. Any wrap-around is handled
5164 by vec_perm_indices. */
5165 sel->new_vector (nelt, 1, 3);
5166 for (unsigned int i = 0; i < 3; i++)
5167 sel->quick_push (i + offset);
5170 /* Checks whether the target supports whole-vector shifts for vectors of mode
5171 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5172 it supports vec_perm_const with masks for all necessary shift amounts. */
5173 static bool
5174 have_whole_vector_shift (machine_mode mode)
5176 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5177 return true;
5179 /* Variable-length vectors should be handled via the optab. */
5180 unsigned int nelt;
5181 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5182 return false;
5184 vec_perm_builder sel;
5185 vec_perm_indices indices;
5186 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5188 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5189 indices.new_vector (sel, 2, nelt);
5190 if (!can_vec_perm_const_p (mode, mode, indices, false))
5191 return false;
5193 return true;
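/* Editorial illustration (not part of GCC): a scalar model of the
   whole-vector shift that the permute mask above encodes.  Selecting element
   I+OFFSET, and taking zero once the index runs past the end, matches how the
   mask is later used with a zero second operand in the shift-based reduction
   epilogue.  Hypothetical, simplified names.  */

static void
sketch_vec_shr (const int *src, int *dst, unsigned int nelt,
                unsigned int offset)
{
  for (unsigned int i = 0; i < nelt; ++i)
    dst[i] = (i + offset < nelt) ? src[i + offset] : 0;
}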
5196 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5197 multiplication operands have differing signs and (b) we intend
5198 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5199 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5201 static bool
5202 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5203 stmt_vec_info stmt_info)
5205 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5206 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5207 return false;
5209 tree rhs1 = gimple_assign_rhs1 (assign);
5210 tree rhs2 = gimple_assign_rhs2 (assign);
5211 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5212 return false;
5214 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5215 gcc_assert (reduc_info->is_reduc_info);
5216 return !directly_supported_p (DOT_PROD_EXPR,
5217 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5218 optab_vector_mixed_sign);
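/* Editorial illustration (not part of GCC): the scalar form of a mixed-sign
   DOT_PROD_EXPR reduction, i.e. one multiplication operand signed and the
   other unsigned.  When the target only provides same-sign variants, the
   vectorizer emulates this with signed DOT_PROD_EXPRs (see
   vect_emulate_mixed_dot_prod).  Hypothetical names.  */

static int
sketch_mixed_sign_dot_prod (const signed char *a, const unsigned char *b,
                            int n)
{
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += (int) a[i] * (int) b[i];     /* widening multiply-accumulate */
  return sum;
}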
5221 /* TODO: There is a close dependency between vect_model_*_cost and the
5222 vectorizable_* functions. Design this better to avoid maintenance issues. */
5224 /* Function vect_model_reduction_cost.
5226 Models cost for a reduction operation, including the vector ops
5227 generated within the strip-mine loop in some cases, the initial
5228 definition before the loop, and the epilogue code that must be generated. */
5230 static void
5231 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5232 stmt_vec_info stmt_info, internal_fn reduc_fn,
5233 vect_reduction_type reduction_type,
5234 int ncopies, stmt_vector_for_cost *cost_vec)
5236 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5237 tree vectype;
5238 machine_mode mode;
5239 class loop *loop = NULL;
5241 if (loop_vinfo)
5242 loop = LOOP_VINFO_LOOP (loop_vinfo);
5244 /* Condition reductions generate two reductions in the loop. */
5245 if (reduction_type == COND_REDUCTION)
5246 ncopies *= 2;
5248 vectype = STMT_VINFO_VECTYPE (stmt_info);
5249 mode = TYPE_MODE (vectype);
5250 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5252 gimple_match_op op;
5253 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5254 gcc_unreachable ();
5256 bool emulated_mixed_dot_prod
5257 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5258 if (reduction_type == EXTRACT_LAST_REDUCTION)
5259 /* No extra instructions are needed in the prologue. The loop body
5260 operations are costed in vectorizable_condition. */
5261 inside_cost = 0;
5262 else if (reduction_type == FOLD_LEFT_REDUCTION)
5264 /* No extra instructions needed in the prologue. */
5265 prologue_cost = 0;
5267 if (reduc_fn != IFN_LAST)
5268 /* Count one reduction-like operation per vector. */
5269 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5270 stmt_info, 0, vect_body);
5271 else
5273 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5274 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5275 inside_cost = record_stmt_cost (cost_vec, nelements,
5276 vec_to_scalar, stmt_info, 0,
5277 vect_body);
5278 inside_cost += record_stmt_cost (cost_vec, nelements,
5279 scalar_stmt, stmt_info, 0,
5280 vect_body);
5283 else
5285 /* Add in the cost of the initial definitions. */
5286 int prologue_stmts;
5287 if (reduction_type == COND_REDUCTION)
5288 /* For cond reductions we have four vectors: initial index, step,
5289 initial result of the data reduction, initial value of the index
5290 reduction. */
5291 prologue_stmts = 4;
5292 else if (emulated_mixed_dot_prod)
5293 /* We need the initial reduction value and two invariants:
5294 one that contains the minimum signed value and one that
5295 contains half of its negative. */
5296 prologue_stmts = 3;
5297 else
5298 prologue_stmts = 1;
5299 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5300 scalar_to_vec, stmt_info, 0,
5301 vect_prologue);
5304 /* Determine cost of epilogue code.
5306 We have a reduction operator that will reduce the vector in one statement.
5307 Also requires scalar extract. */
5309 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5311 if (reduc_fn != IFN_LAST)
5313 if (reduction_type == COND_REDUCTION)
5315 /* An EQ stmt and a COND_EXPR stmt. */
5316 epilogue_cost += record_stmt_cost (cost_vec, 2,
5317 vector_stmt, stmt_info, 0,
5318 vect_epilogue);
5319 /* Reduction of the max index and a reduction of the found
5320 values. */
5321 epilogue_cost += record_stmt_cost (cost_vec, 2,
5322 vec_to_scalar, stmt_info, 0,
5323 vect_epilogue);
5324 /* A broadcast of the max value. */
5325 epilogue_cost += record_stmt_cost (cost_vec, 1,
5326 scalar_to_vec, stmt_info, 0,
5327 vect_epilogue);
5329 else
5331 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5332 stmt_info, 0, vect_epilogue);
5333 epilogue_cost += record_stmt_cost (cost_vec, 1,
5334 vec_to_scalar, stmt_info, 0,
5335 vect_epilogue);
5338 else if (reduction_type == COND_REDUCTION)
5340 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5341 /* Extraction of scalar elements. */
5342 epilogue_cost += record_stmt_cost (cost_vec,
5343 2 * estimated_nunits,
5344 vec_to_scalar, stmt_info, 0,
5345 vect_epilogue);
5346 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5347 epilogue_cost += record_stmt_cost (cost_vec,
5348 2 * estimated_nunits - 3,
5349 scalar_stmt, stmt_info, 0,
5350 vect_epilogue);
5352 else if (reduction_type == EXTRACT_LAST_REDUCTION
5353 || reduction_type == FOLD_LEFT_REDUCTION)
5354 /* No extra instructions are needed in the epilogue. */
5356 else
5358 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5359 tree bitsize = TYPE_SIZE (op.type);
5360 int element_bitsize = tree_to_uhwi (bitsize);
5361 int nelements = vec_size_in_bits / element_bitsize;
5363 if (op.code == COND_EXPR)
5364 op.code = MAX_EXPR;
5366 /* We have a whole vector shift available. */
5367 if (VECTOR_MODE_P (mode)
5368 && directly_supported_p (op.code, vectype)
5369 && have_whole_vector_shift (mode))
5371 /* Final reduction via vector shifts and the reduction operator.
5372 Also requires scalar extract. */
5373 epilogue_cost += record_stmt_cost (cost_vec,
5374 exact_log2 (nelements) * 2,
5375 vector_stmt, stmt_info, 0,
5376 vect_epilogue);
5377 epilogue_cost += record_stmt_cost (cost_vec, 1,
5378 vec_to_scalar, stmt_info, 0,
5379 vect_epilogue);
5381 else
5382 /* Use extracts and reduction op for final reduction. For N
5383 elements, we have N extracts and N-1 reduction ops. */
5384 epilogue_cost += record_stmt_cost (cost_vec,
5385 nelements + nelements - 1,
5386 vector_stmt, stmt_info, 0,
5387 vect_epilogue);
5391 if (dump_enabled_p ())
5392 dump_printf (MSG_NOTE,
5393 "vect_model_reduction_cost: inside_cost = %d, "
5394 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5395 prologue_cost, epilogue_cost);
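/* Editorial illustration (not part of GCC): the three cost buckets that the
   function above fills in.  A reduction contributes statements to the
   prologue (initial vector definitions before the loop), to the loop body,
   and to the epilogue (the final cross-lane reduction and scalar extract);
   the overall cost is simply their sum.  Hypothetical, simplified type.  */

struct sketch_reduction_cost
{
  int prologue_cost;    /* initial definitions before the loop */
  int inside_cost;      /* per-copy vector ops inside the loop */
  int epilogue_cost;    /* final reduction and scalar extraction */

  int total () const
  { return prologue_cost + inside_cost + epilogue_cost; }
};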
5398 /* SEQ is a sequence of instructions that initialize the reduction
5399 described by REDUC_INFO. Emit them in the appropriate place. */
5401 static void
5402 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5403 stmt_vec_info reduc_info, gimple *seq)
5405 if (reduc_info->reused_accumulator)
5407 /* When reusing an accumulator from the main loop, we only need
5408 initialization instructions if the main loop can be skipped.
5409 In that case, emit the initialization instructions at the end
5410 of the guard block that does the skip. */
5411 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5412 gcc_assert (skip_edge);
5413 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5414 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5416 else
5418 /* The normal case: emit the initialization instructions on the
5419 preheader edge. */
5420 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5421 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5425 /* Function get_initial_def_for_reduction
5427 Input:
5428 REDUC_INFO - the info_for_reduction
5429 INIT_VAL - the initial value of the reduction variable
5430 NEUTRAL_OP - a value that has no effect on the reduction, as per
5431 neutral_op_for_reduction
5433 Output:
5434 Return a vector variable, initialized according to the operation that
5435 STMT_VINFO performs. This vector will be used as the initial value
5436 of the vector of partial results.
5438 The value we need is a vector in which element 0 has value INIT_VAL
5439 and every other element has value NEUTRAL_OP. */
5441 static tree
5442 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5443 stmt_vec_info reduc_info,
5444 tree init_val, tree neutral_op)
5446 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5447 tree scalar_type = TREE_TYPE (init_val);
5448 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5449 tree init_def;
5450 gimple_seq stmts = NULL;
5452 gcc_assert (vectype);
5454 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5455 || SCALAR_FLOAT_TYPE_P (scalar_type));
5457 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5458 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5460 if (operand_equal_p (init_val, neutral_op))
5462 /* If both elements are equal then the vector described above is
5463 just a splat. */
5464 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5465 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5467 else
5469 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5470 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5471 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5473 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5474 element 0. */
5475 init_def = gimple_build_vector_from_val (&stmts, vectype,
5476 neutral_op);
5477 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5478 vectype, init_def, init_val);
5480 else
5482 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5483 tree_vector_builder elts (vectype, 1, 2);
5484 elts.quick_push (init_val);
5485 elts.quick_push (neutral_op);
5486 init_def = gimple_build_vector (&stmts, &elts);
5490 if (stmts)
5491 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5492 return init_def;
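/* Editorial illustration (not part of GCC): the fixed-width form of the
   vector built above -- element 0 carries INIT_VAL and every other lane
   carries NEUTRAL_OP (e.g. {init, 0, 0, 0} for a sum reduction).
   Hypothetical names; the real code also handles variable-length vectors
   via CFN_VEC_SHL_INSERT.  */

static void
sketch_initial_def_for_reduction (int *vec, unsigned int nunits,
                                  int init_val, int neutral_op)
{
  vec[0] = init_val;
  for (unsigned int i = 1; i < nunits; ++i)
    vec[i] = neutral_op;
}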
5495 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5496 which performs a reduction involving GROUP_SIZE scalar statements.
5497 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5498 is nonnull, introducing extra elements of that value will not change the
5499 result. */
5501 static void
5502 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5503 stmt_vec_info reduc_info,
5504 vec<tree> *vec_oprnds,
5505 unsigned int number_of_vectors,
5506 unsigned int group_size, tree neutral_op)
5508 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5509 unsigned HOST_WIDE_INT nunits;
5510 unsigned j, number_of_places_left_in_vector;
5511 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5512 unsigned int i;
5514 gcc_assert (group_size == initial_values.length () || neutral_op);
5516 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5517 created vectors. It is greater than 1 if unrolling is performed.
5519 For example, we have two scalar operands, s1 and s2 (e.g., group of
5520 strided accesses of size two), while NUNITS is four (i.e., four scalars
5521 of this type can be packed in a vector). The output vector will contain
5522 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5523 will be 2).
5525 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5526 vectors containing the operands.
5528 For example, NUNITS is four as before, and the group size is 8
5529 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5530 {s5, s6, s7, s8}. */
5532 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5533 nunits = group_size;
5535 number_of_places_left_in_vector = nunits;
5536 bool constant_p = true;
5537 tree_vector_builder elts (vector_type, nunits, 1);
5538 elts.quick_grow (nunits);
5539 gimple_seq ctor_seq = NULL;
5540 for (j = 0; j < nunits * number_of_vectors; ++j)
5542 tree op;
5543 i = j % group_size;
5545 /* Get the def before the loop. In reduction chain we have only
5546 one initial value. Else we have as many as PHIs in the group. */
5547 if (i >= initial_values.length () || (j > i && neutral_op))
5548 op = neutral_op;
5549 else
5550 op = initial_values[i];
5552 /* Create 'vect_ = {op0,op1,...,opn}'. */
5553 number_of_places_left_in_vector--;
5554 elts[nunits - number_of_places_left_in_vector - 1] = op;
5555 if (!CONSTANT_CLASS_P (op))
5556 constant_p = false;
5558 if (number_of_places_left_in_vector == 0)
5560 tree init;
5561 if (constant_p && !neutral_op
5562 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5563 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5564 /* Build the vector directly from ELTS. */
5565 init = gimple_build_vector (&ctor_seq, &elts);
5566 else if (neutral_op)
5568 /* Build a vector of the neutral value and shift the
5569 other elements into place. */
5570 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5571 neutral_op);
5572 int k = nunits;
5573 while (k > 0 && elts[k - 1] == neutral_op)
5574 k -= 1;
5575 while (k > 0)
5577 k -= 1;
5578 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5579 vector_type, init, elts[k]);
5582 else
5584 /* First time round, duplicate ELTS to fill the
5585 required number of vectors. */
5586 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5587 elts, number_of_vectors, *vec_oprnds);
5588 break;
5590 vec_oprnds->quick_push (init);
5592 number_of_places_left_in_vector = nunits;
5593 elts.new_vector (vector_type, nunits, 1);
5594 elts.quick_grow (nunits);
5595 constant_p = true;
5598 if (ctor_seq != NULL)
5599 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
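/* Editorial illustration (not part of GCC): how the loop above distributes a
   group's initial values across vectors when no neutral value is available --
   the scalars are repeated round-robin, so {s1, s2} with NUNITS == 4 becomes
   {s1, s2, s1, s2}, and a group of 8 with NUNITS == 4 fills two vectors.
   When a neutral value exists, only the first copy of each scalar is kept and
   later repeats take the neutral value instead.  Hypothetical names; OUT is
   assumed to hold NUNITS * NUMBER_OF_VECTORS lanes.  */

static void
sketch_pack_group_initial_values (const int *scalars, unsigned int group_size,
                                  int *out, unsigned int nunits,
                                  unsigned int number_of_vectors)
{
  for (unsigned int j = 0; j < nunits * number_of_vectors; ++j)
    out[j] = scalars[j % group_size];   /* round-robin repetition */
}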
5602 /* For a statement STMT_INFO taking part in a reduction operation return
5603 the stmt_vec_info the meta information is stored on. */
5605 stmt_vec_info
5606 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5608 stmt_info = vect_orig_stmt (stmt_info);
5609 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5610 if (!is_a <gphi *> (stmt_info->stmt)
5611 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5612 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5613 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5614 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5616 if (gimple_phi_num_args (phi) == 1)
5617 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5619 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5621 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5622 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5623 stmt_info = info;
5625 return stmt_info;
5628 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5629 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5630 return false. */
5632 static bool
5633 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5634 stmt_vec_info reduc_info)
5636 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5637 if (!main_loop_vinfo)
5638 return false;
5640 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5641 return false;
5643 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5644 auto_vec<tree, 16> main_loop_results (num_phis);
5645 auto_vec<tree, 16> initial_values (num_phis);
5646 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5648 /* The epilogue loop can be entered either from the main loop or
5649 from an earlier guard block. */
5650 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5651 for (tree incoming_value : reduc_info->reduc_initial_values)
5653 /* Look for:
5655 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5656 INITIAL_VALUE(guard block)>. */
5657 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5659 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5660 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5662 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5663 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5665 main_loop_results.quick_push (from_main_loop);
5666 initial_values.quick_push (from_skip);
5669 else
5670 /* The main loop dominates the epilogue loop. */
5671 main_loop_results.splice (reduc_info->reduc_initial_values);
5673 /* See if the main loop has the kind of accumulator we need. */
5674 vect_reusable_accumulator *accumulator
5675 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5676 if (!accumulator
5677 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5678 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5679 accumulator->reduc_info->reduc_scalar_results.begin ()))
5680 return false;
5682 /* Handle the case where we can reduce wider vectors to narrower ones. */
5683 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5684 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5685 unsigned HOST_WIDE_INT m;
5686 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5687 TYPE_VECTOR_SUBPARTS (vectype), &m))
5688 return false;
5689 /* Check the intermediate vector types and operations are available. */
5690 tree prev_vectype = old_vectype;
5691 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5692 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5694 intermediate_nunits = exact_div (intermediate_nunits, 2);
5695 tree intermediate_vectype = get_related_vectype_for_scalar_type
5696 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5697 if (!intermediate_vectype
5698 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5699 intermediate_vectype)
5700 || !can_vec_extract (TYPE_MODE (prev_vectype),
5701 TYPE_MODE (intermediate_vectype)))
5702 return false;
5703 prev_vectype = intermediate_vectype;
5706 /* Non-SLP reductions might apply an adjustment after the reduction
5707 operation, in order to simplify the initialization of the accumulator.
5708 If the epilogue loop carries on from where the main loop left off,
5709 it should apply the same adjustment to the final reduction result.
5711 If the epilogue loop can also be entered directly (rather than via
5712 the main loop), we need to be able to handle that case in the same way,
5713 with the same adjustment. (In principle we could add a PHI node
5714 to select the correct adjustment, but in practice that shouldn't be
5715 necessary.) */
5716 tree main_adjustment
5717 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5718 if (loop_vinfo->main_loop_edge && main_adjustment)
5720 gcc_assert (num_phis == 1);
5721 tree initial_value = initial_values[0];
5722 /* Check that we can use INITIAL_VALUE as the adjustment and
5723 initialize the accumulator with a neutral value instead. */
5724 if (!operand_equal_p (initial_value, main_adjustment))
5725 return false;
5726 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5727 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5728 code, initial_value);
5730 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5731 reduc_info->reduc_initial_values.truncate (0);
5732 reduc_info->reduc_initial_values.splice (initial_values);
5733 reduc_info->reused_accumulator = accumulator;
5734 return true;
5737 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5738 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5740 static tree
5741 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5742 gimple_seq *seq)
5744 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5745 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5746 tree stype = TREE_TYPE (vectype);
5747 tree new_temp = vec_def;
5748 while (nunits > nunits1)
5750 nunits /= 2;
5751 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5752 stype, nunits);
5753 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5755 /* The target has to make sure we support lowpart/highpart
5756 extraction, either via direct vector extract or through
5757 an integer mode punning. */
5758 tree dst1, dst2;
5759 gimple *epilog_stmt;
5760 if (convert_optab_handler (vec_extract_optab,
5761 TYPE_MODE (TREE_TYPE (new_temp)),
5762 TYPE_MODE (vectype1))
5763 != CODE_FOR_nothing)
5765 /* Extract sub-vectors directly once vec_extract becomes
5766 a conversion optab. */
5767 dst1 = make_ssa_name (vectype1);
5768 epilog_stmt
5769 = gimple_build_assign (dst1, BIT_FIELD_REF,
5770 build3 (BIT_FIELD_REF, vectype1,
5771 new_temp, TYPE_SIZE (vectype1),
5772 bitsize_int (0)));
5773 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5774 dst2 = make_ssa_name (vectype1);
5775 epilog_stmt
5776 = gimple_build_assign (dst2, BIT_FIELD_REF,
5777 build3 (BIT_FIELD_REF, vectype1,
5778 new_temp, TYPE_SIZE (vectype1),
5779 bitsize_int (bitsize)));
5780 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5782 else
5784 /* Extract via punning to appropriately sized integer mode
5785 vector. */
5786 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5787 tree etype = build_vector_type (eltype, 2);
5788 gcc_assert (convert_optab_handler (vec_extract_optab,
5789 TYPE_MODE (etype),
5790 TYPE_MODE (eltype))
5791 != CODE_FOR_nothing);
5792 tree tem = make_ssa_name (etype);
5793 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5794 build1 (VIEW_CONVERT_EXPR,
5795 etype, new_temp));
5796 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5797 new_temp = tem;
5798 tem = make_ssa_name (eltype);
5799 epilog_stmt
5800 = gimple_build_assign (tem, BIT_FIELD_REF,
5801 build3 (BIT_FIELD_REF, eltype,
5802 new_temp, TYPE_SIZE (eltype),
5803 bitsize_int (0)));
5804 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5805 dst1 = make_ssa_name (vectype1);
5806 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5807 build1 (VIEW_CONVERT_EXPR,
5808 vectype1, tem));
5809 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5810 tem = make_ssa_name (eltype);
5811 epilog_stmt
5812 = gimple_build_assign (tem, BIT_FIELD_REF,
5813 build3 (BIT_FIELD_REF, eltype,
5814 new_temp, TYPE_SIZE (eltype),
5815 bitsize_int (bitsize)));
5816 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5817 dst2 = make_ssa_name (vectype1);
5818 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5819 build1 (VIEW_CONVERT_EXPR,
5820 vectype1, tem));
5821 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5824 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5827 return new_temp;
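/* Editorial illustration (not part of GCC): the halving scheme used above,
   shown for a plain integer addition.  Each step extracts the low and high
   halves of the current vector and combines them lane-wise with the
   reduction operation until the requested width is reached.  Hypothetical
   names; VEC is modified in place and NUNITS is assumed to be a power of
   two multiple of TARGET_NUNITS.  */

static void
sketch_partial_epilog_add (int *vec, unsigned int nunits,
                           unsigned int target_nunits)
{
  while (nunits > target_nunits)
    {
      nunits /= 2;
      for (unsigned int i = 0; i < nunits; ++i)
        vec[i] = vec[i] + vec[i + nunits];      /* low half op high half */
    }
}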
5830 /* Function vect_create_epilog_for_reduction
5832 Create code at the loop-epilog to finalize the result of a reduction
5833 computation.
5835 STMT_INFO is the scalar reduction stmt that is being vectorized.
5836 SLP_NODE is an SLP node containing a group of reduction statements. The
5837 first one in this group is STMT_INFO.
5838 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5839 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5840 (counting from 0)
5842 This function:
5843 1. Completes the reduction def-use cycles.
5844 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5845 by calling the function specified by REDUC_FN if available, or by
5846 other means (whole-vector shifts or a scalar loop).
5847 The function also creates a new phi node at the loop exit to preserve
5848 loop-closed form, as illustrated below.
5850 The flow at the entry to this function:
5852 loop:
5853 vec_def = phi <vec_init, null> # REDUCTION_PHI
5854 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5855 s_loop = scalar_stmt # (scalar) STMT_INFO
5856 loop_exit:
5857 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5858 use <s_out0>
5859 use <s_out0>
5861 The above is transformed by this function into:
5863 loop:
5864 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5865 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5866 s_loop = scalar_stmt # (scalar) STMT_INFO
5867 loop_exit:
5868 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5869 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5870 v_out2 = reduce <v_out1>
5871 s_out3 = extract_field <v_out2, 0>
5872 s_out4 = adjust_result <s_out3>
5873 use <s_out4>
5874 use <s_out4>
5877 static void
5878 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5879 stmt_vec_info stmt_info,
5880 slp_tree slp_node,
5881 slp_instance slp_node_instance)
5883 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5884 gcc_assert (reduc_info->is_reduc_info);
5885 /* For double reductions we need to get at the inner loop reduction
5886 stmt which has the meta info attached. Our stmt_info is that of the
5887 loop-closed PHI of the inner loop which we remember as
5888 def for the reduction PHI generation. */
5889 bool double_reduc = false;
5890 stmt_vec_info rdef_info = stmt_info;
5891 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5893 gcc_assert (!slp_node);
5894 double_reduc = true;
5895 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5896 (stmt_info->stmt, 0));
5897 stmt_info = vect_stmt_to_vectorize (stmt_info);
5899 gphi *reduc_def_stmt
5900 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5901 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5902 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5903 tree vectype;
5904 machine_mode mode;
5905 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5906 basic_block exit_bb;
5907 tree scalar_dest;
5908 tree scalar_type;
5909 gimple *new_phi = NULL, *phi = NULL;
5910 gimple_stmt_iterator exit_gsi;
5911 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5912 gimple *epilog_stmt = NULL;
5913 gimple *exit_phi;
5914 tree bitsize;
5915 tree def;
5916 tree orig_name, scalar_result;
5917 imm_use_iterator imm_iter, phi_imm_iter;
5918 use_operand_p use_p, phi_use_p;
5919 gimple *use_stmt;
5920 auto_vec<tree> reduc_inputs;
5921 int j, i;
5922 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5923 unsigned int group_size = 1, k;
5924 auto_vec<gimple *> phis;
5925 /* SLP reduction without reduction chain, e.g.,
5926 # a1 = phi <a2, a0>
5927 # b1 = phi <b2, b0>
5928 a2 = operation (a1)
5929 b2 = operation (b1) */
5930 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5931 bool direct_slp_reduc;
5932 tree induction_index = NULL_TREE;
5934 if (slp_node)
5935 group_size = SLP_TREE_LANES (slp_node);
5937 if (nested_in_vect_loop_p (loop, stmt_info))
5939 outer_loop = loop;
5940 loop = loop->inner;
5941 gcc_assert (!slp_node && double_reduc);
5944 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5945 gcc_assert (vectype);
5946 mode = TYPE_MODE (vectype);
5948 tree induc_val = NULL_TREE;
5949 tree adjustment_def = NULL;
5950 if (slp_node)
5952 else
5954 /* Optimize: for induction condition reduction, if we can't use zero
5955 for induc_val, use initial_def. */
5956 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5957 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5958 else if (double_reduc)
5960 else
5961 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5964 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5965 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5966 if (slp_reduc)
5967 /* All statements produce live-out values. */
5968 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5969 else if (slp_node)
5971 /* The last statement in the reduction chain produces the live-out
5972 value. Note SLP optimization can shuffle scalar stmts to
5973 optimize permutations so we have to search for the last stmt. */
5974 for (k = 0; k < group_size; ++k)
5975 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5977 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5978 break;
5982 unsigned vec_num;
5983 int ncopies;
5984 if (slp_node)
5986 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5987 ncopies = 1;
5989 else
5991 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5992 vec_num = 1;
5993 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5996 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5997 which is updated with the current index of the loop for every match of
5998 the original loop's cond_expr (VEC_STMT). This results in a vector
5999 containing the last time the condition passed for that vector lane.
6000 The first match will be a 1 to allow 0 to be used for non-matching
6001 indexes. If there are no matches at all then the vector will be all
6002 zeroes.
6004 PR92772: This algorithm is broken for architectures that support
6005 masked vectors, but do not provide fold_extract_last. */
6006 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6008 auto_vec<std::pair<tree, bool>, 2> ccompares;
6009 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6010 cond_info = vect_stmt_to_vectorize (cond_info);
6011 while (cond_info != reduc_info)
6013 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6015 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6016 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6017 ccompares.safe_push
6018 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6019 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6021 cond_info
6022 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6023 1 + STMT_VINFO_REDUC_IDX
6024 (cond_info)));
6025 cond_info = vect_stmt_to_vectorize (cond_info);
6027 gcc_assert (ccompares.length () != 0);
6029 tree indx_before_incr, indx_after_incr;
6030 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6031 int scalar_precision
6032 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6033 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6034 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6035 (TYPE_MODE (vectype), cr_index_scalar_type,
6036 TYPE_VECTOR_SUBPARTS (vectype));
6038 /* First we create a simple vector induction variable which starts
6039 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6040 vector size (STEP). */
6042 /* Create a {1,2,3,...} vector. */
6043 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6045 /* Create a vector of the step value. */
6046 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6047 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6049 /* Create an induction variable. */
6050 gimple_stmt_iterator incr_gsi;
6051 bool insert_after;
6052 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6053 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6054 insert_after, &indx_before_incr, &indx_after_incr);
6056 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6057 filled with zeros (VEC_ZERO). */
6059 /* Create a vector of 0s. */
6060 tree zero = build_zero_cst (cr_index_scalar_type);
6061 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6063 /* Create a vector phi node. */
6064 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6065 new_phi = create_phi_node (new_phi_tree, loop->header);
6066 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6067 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6069 /* Now take the condition from the loops original cond_exprs
6070 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6071 every match uses values from the induction variable
6072 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6073 (NEW_PHI_TREE).
6074 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6075 the new cond_expr (INDEX_COND_EXPR). */
6076 gimple_seq stmts = NULL;
6077 for (int i = ccompares.length () - 1; i != -1; --i)
6079 tree ccompare = ccompares[i].first;
6080 if (ccompares[i].second)
6081 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6082 cr_index_vector_type,
6083 ccompare,
6084 indx_before_incr, new_phi_tree);
6085 else
6086 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6087 cr_index_vector_type,
6088 ccompare,
6089 new_phi_tree, indx_before_incr);
6091 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6093 /* Update the phi with the vec cond. */
6094 induction_index = new_phi_tree;
6095 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6096 loop_latch_edge (loop), UNKNOWN_LOCATION);
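/* Editorial illustration (not part of GCC): the per-lane meaning of the
   induction index built above.  Each lane remembers the (1-based) loop index
   of the last iteration in which that lane's condition matched; lanes that
   never match stay 0.  Hypothetical names; THRESHOLD stands in for the
   original loop's cond_expr, and this models a single vector lane.  */

static unsigned int
sketch_last_match_index (const int *values, unsigned int n, int threshold)
{
  unsigned int last_match = 0;          /* 0 == "never matched" */
  for (unsigned int i = 0; i < n; ++i)
    if (values[i] > threshold)          /* stand-in for the condition */
      last_match = i + 1;               /* recorded indices start at 1 */
  return last_match;
}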
6099 /* 2. Create epilog code.
6100 The reduction epilog code operates across the elements of the vector
6101 of partial results computed by the vectorized loop.
6102 The reduction epilog code consists of:
6104 step 1: compute the scalar result in a vector (v_out2)
6105 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6106 step 3: adjust the scalar result (s_out3) if needed.
6108 Step 1 can be accomplished using one of the following three schemes:
6109 (scheme 1) using reduc_fn, if available.
6110 (scheme 2) using whole-vector shifts, if available.
6111 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6112 combined.
6114 The overall epilog code looks like this:
6116 s_out0 = phi <s_loop> # original EXIT_PHI
6117 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6118 v_out2 = reduce <v_out1> # step 1
6119 s_out3 = extract_field <v_out2, 0> # step 2
6120 s_out4 = adjust_result <s_out3> # step 3
6122 (step 3 is optional, and steps 1 and 2 may be combined).
6123 Lastly, the uses of s_out0 are replaced by s_out4. */
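/* Editorial illustration (not part of GCC): the three epilogue steps listed
   above for a simple sum reduction -- reduce the vector of partial results,
   extract the scalar, then apply the optional adjustment.  Hypothetical
   names.  */

static int
sketch_reduction_epilogue (const int *partials, unsigned int nunits,
                           int adjustment)
{
  int result = 0;
  for (unsigned int i = 0; i < nunits; ++i)     /* steps 1+2: reduce, extract */
    result += partials[i];
  return result + adjustment;                   /* step 3: adjust_result */
}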
6126 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6127 v_out1 = phi <VECT_DEF>
6128 Store them in NEW_PHIS. */
6129 if (double_reduc)
6130 loop = outer_loop;
6131 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6132 exit_gsi = gsi_after_labels (exit_bb);
6133 reduc_inputs.create (slp_node ? vec_num : ncopies);
6134 for (unsigned i = 0; i < vec_num; i++)
6136 gimple_seq stmts = NULL;
6137 if (slp_node)
6138 def = vect_get_slp_vect_def (slp_node, i);
6139 else
6140 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6141 for (j = 0; j < ncopies; j++)
6143 tree new_def = copy_ssa_name (def);
6144 phi = create_phi_node (new_def, exit_bb);
6145 if (j)
6146 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6147 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6148 new_def = gimple_convert (&stmts, vectype, new_def);
6149 reduc_inputs.quick_push (new_def);
6151 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6154 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6155 (i.e. when reduc_fn is not available) and in the final adjustment
6156 code (if needed). Also get the original scalar reduction variable as
6157 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6158 represents a reduction pattern), the tree-code and scalar-def are
6159 taken from the original stmt that the pattern-stmt (STMT) replaces.
6160 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6161 are taken from STMT. */
6163 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6164 if (orig_stmt_info != stmt_info)
6166 /* Reduction pattern */
6167 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6168 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6171 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6172 scalar_type = TREE_TYPE (scalar_dest);
6173 scalar_results.truncate (0);
6174 scalar_results.reserve_exact (group_size);
6175 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6176 bitsize = TYPE_SIZE (scalar_type);
6178 /* True if we should implement SLP_REDUC using native reduction operations
6179 instead of scalar operations. */
6180 direct_slp_reduc = (reduc_fn != IFN_LAST
6181 && slp_reduc
6182 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6184 /* In case of reduction chain, e.g.,
6185 # a1 = phi <a3, a0>
6186 a2 = operation (a1)
6187 a3 = operation (a2),
6189 we may end up with more than one vector result. Here we reduce them
6190 to one vector.
6192 The same is true for a SLP reduction, e.g.,
6193 # a1 = phi <a2, a0>
6194 # b1 = phi <b2, b0>
6195 a2 = operation (a1)
6196 b2 = operation (b1),
6198 where we can end up with more than one vector as well. We can
6199 easily accumulate vectors when the number of vector elements is
6200 a multiple of the SLP group size.
6202 The same is true if we couldn't use a single defuse cycle. */
6203 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6204 || direct_slp_reduc
6205 || (slp_reduc
6206 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6207 || ncopies > 1)
6209 gimple_seq stmts = NULL;
6210 tree single_input = reduc_inputs[0];
6211 for (k = 1; k < reduc_inputs.length (); k++)
6212 single_input = gimple_build (&stmts, code, vectype,
6213 single_input, reduc_inputs[k]);
6214 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6216 reduc_inputs.truncate (0);
6217 reduc_inputs.safe_push (single_input);
6220 tree orig_reduc_input = reduc_inputs[0];
6222 /* If this loop is an epilogue loop that can be skipped after the
6223 main loop, we can only share a reduction operation between the
6224 main loop and the epilogue if we put it at the target of the
6225 skip edge.
6227 We can still reuse accumulators if this check fails. Doing so has
6228 the minor(?) benefit of making the epilogue loop's scalar result
6229 independent of the main loop's scalar result. */
6230 bool unify_with_main_loop_p = false;
6231 if (reduc_info->reused_accumulator
6232 && loop_vinfo->skip_this_loop_edge
6233 && single_succ_p (exit_bb)
6234 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6236 unify_with_main_loop_p = true;
6238 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6239 reduc_inputs[0] = make_ssa_name (vectype);
6240 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6241 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6242 UNKNOWN_LOCATION);
6243 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6244 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6245 exit_gsi = gsi_after_labels (reduc_block);
6248 /* Shouldn't be used beyond this point. */
6249 exit_bb = nullptr;
6251 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6252 && reduc_fn != IFN_LAST)
6254 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6255 various data values where the condition matched and another vector
6256 (INDUCTION_INDEX) containing all the indexes of those matches. We
6257 need to extract the last matching index (which will be the index with
6258 highest value) and use this to index into the data vector.
6259 For the case where there were no matches, the data vector will contain
6260 all default values and the index vector will be all zeros. */
6262 /* Get various versions of the type of the vector of indexes. */
6263 tree index_vec_type = TREE_TYPE (induction_index);
6264 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6265 tree index_scalar_type = TREE_TYPE (index_vec_type);
6266 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6268 /* Get an unsigned integer version of the type of the data vector. */
6269 int scalar_precision
6270 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6271 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6272 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6273 vectype);
6275 /* First we need to create a vector (ZERO_VEC) of zeros and another
6276 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6277 can create using a MAX reduction and then expanding.
6278 In the case where the loop never made any matches, the max index will
6279 be zero. */
6281 /* Vector of {0, 0, 0,...}. */
6282 tree zero_vec = build_zero_cst (vectype);
6284 /* Find maximum value from the vector of found indexes. */
6285 tree max_index = make_ssa_name (index_scalar_type);
6286 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6287 1, induction_index);
6288 gimple_call_set_lhs (max_index_stmt, max_index);
6289 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6291 /* Vector of {max_index, max_index, max_index,...}. */
6292 tree max_index_vec = make_ssa_name (index_vec_type);
6293 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6294 max_index);
6295 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6296 max_index_vec_rhs);
6297 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6299 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6300 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6301 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6302 otherwise. Only one value should match, resulting in a vector
6303 (VEC_COND) with one data value and the rest zeros.
6304 In the case where the loop never made any matches, every index will
6305 match, resulting in a vector with all data values (which will all be
6306 the default value). */
6308 /* Compare the max index vector to the vector of found indexes to find
6309 the position of the max value. */
6310 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6311 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6312 induction_index,
6313 max_index_vec);
6314 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6316 /* Use the compare to choose either values from the data vector or
6317 zero. */
6318 tree vec_cond = make_ssa_name (vectype);
6319 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6320 vec_compare,
6321 reduc_inputs[0],
6322 zero_vec);
6323 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6325 /* Finally we need to extract the data value from the vector (VEC_COND)
6326 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6327 reduction, but because this doesn't exist, we can use a MAX reduction
6328 instead. The data value might be signed or a float so we need to cast
6329 it first.
6330 In the case where the loop never made any matches, the data values are
6331 all identical, and so will reduce down correctly. */
6333 /* Make the matched data values unsigned. */
6334 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6335 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6336 vec_cond);
6337 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6338 VIEW_CONVERT_EXPR,
6339 vec_cond_cast_rhs);
6340 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6342 /* Reduce down to a scalar value. */
6343 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6344 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6345 1, vec_cond_cast);
6346 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6347 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6349 /* Convert the reduced value back to the result type and set as the
6350 result. */
6351 gimple_seq stmts = NULL;
6352 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6353 data_reduc);
6354 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6355 scalar_results.safe_push (new_temp);
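/* Editorial illustration (not part of GCC): a scalar model of the scheme
   above.  The lane whose recorded index equals the maximum index holds the
   wanted data value; all other lanes are zeroed and a MAX reduction then
   extracts it.  If no lane ever matched, every index is zero, every lane
   still holds the default value, and the result degenerates to that default.
   Hypothetical names; values are taken as unsigned here so that MAX over the
   zeroed lanes is safe, mirroring the VIEW_CONVERT to unsigned above.  */

static unsigned int
sketch_cond_reduction_extract (const unsigned int *data,
                               const unsigned int *index, unsigned int nunits)
{
  unsigned int max_index = 0;
  for (unsigned int i = 0; i < nunits; ++i)
    if (index[i] > max_index)
      max_index = index[i];

  unsigned int result = 0;
  for (unsigned int i = 0; i < nunits; ++i)
    {
      unsigned int candidate = (index[i] == max_index) ? data[i] : 0;
      if (candidate > result)
        result = candidate;
    }
  return result;
}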
6357 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6358 && reduc_fn == IFN_LAST)
6360 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6361 idx = 0;
6362 idx_val = induction_index[0];
6363 val = data_reduc[0];
6364 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6365 if (induction_index[i] > idx_val)
6366 val = data_reduc[i], idx_val = induction_index[i];
6367 return val; */
6369 tree data_eltype = TREE_TYPE (vectype);
6370 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6371 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6372 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6373 /* Enforced by vectorizable_reduction, which ensures we have target
6374 support before allowing a conditional reduction on variable-length
6375 vectors. */
6376 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6377 tree idx_val = NULL_TREE, val = NULL_TREE;
6378 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6380 tree old_idx_val = idx_val;
6381 tree old_val = val;
6382 idx_val = make_ssa_name (idx_eltype);
6383 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6384 build3 (BIT_FIELD_REF, idx_eltype,
6385 induction_index,
6386 bitsize_int (el_size),
6387 bitsize_int (off)));
6388 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6389 val = make_ssa_name (data_eltype);
6390 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6391 build3 (BIT_FIELD_REF,
6392 data_eltype,
6393 reduc_inputs[0],
6394 bitsize_int (el_size),
6395 bitsize_int (off)));
6396 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6397 if (off != 0)
6399 tree new_idx_val = idx_val;
6400 if (off != v_size - el_size)
6402 new_idx_val = make_ssa_name (idx_eltype);
6403 epilog_stmt = gimple_build_assign (new_idx_val,
6404 MAX_EXPR, idx_val,
6405 old_idx_val);
6406 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6408 tree cond = make_ssa_name (boolean_type_node);
6409 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6410 idx_val, old_idx_val);
6411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6412 tree new_val = make_ssa_name (data_eltype);
6413 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6414 cond, val, old_val);
6415 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6416 idx_val = new_idx_val;
6417 val = new_val;
6420 /* Convert the reduced value back to the result type and set as the
6421 result. */
6422 gimple_seq stmts = NULL;
6423 val = gimple_convert (&stmts, scalar_type, val);
6424 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6425 scalar_results.safe_push (val);
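/* Editorial illustration (not part of GCC): a directly runnable version of
   the pseudocode in the comment above -- a linear scan that keeps the data
   value belonging to the largest recorded index.  Hypothetical names;
   NELTS is assumed to be at least 1.  */

static int
sketch_cond_reduction_scan (const int *data_reduc,
                            const unsigned int *induction_index,
                            unsigned int nelts)
{
  unsigned int idx_val = induction_index[0];
  int val = data_reduc[0];
  for (unsigned int i = 1; i < nelts; ++i)
    if (induction_index[i] > idx_val)
      {
        idx_val = induction_index[i];
        val = data_reduc[i];
      }
  return val;
}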
6428 /* 2.3 Create the reduction code, using one of the three schemes described
6429 above. In SLP we simply need to extract all the elements from the
6430 vector (without reducing them), so we use scalar shifts. */
6431 else if (reduc_fn != IFN_LAST && !slp_reduc)
6433 tree tmp;
6434 tree vec_elem_type;
6436 /* Case 1: Create:
6437 v_out2 = reduc_expr <v_out1> */
6439 if (dump_enabled_p ())
6440 dump_printf_loc (MSG_NOTE, vect_location,
6441 "Reduce using direct vector reduction.\n");
6443 gimple_seq stmts = NULL;
6444 vec_elem_type = TREE_TYPE (vectype);
6445 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6446 vec_elem_type, reduc_inputs[0]);
6447 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6448 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6450 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6451 && induc_val)
6453 /* Earlier we set the initial value to be a vector of induc_val
6454 values. Check the result, and if it is induc_val then replace it
6455 with the original initial value, unless induc_val is
6456 the same as initial_def already. */
6457 tree zcompare = make_ssa_name (boolean_type_node);
6458 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6459 new_temp, induc_val);
6460 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6461 tree initial_def = reduc_info->reduc_initial_values[0];
6462 tmp = make_ssa_name (new_scalar_dest);
6463 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6464 initial_def, new_temp);
6465 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6466 new_temp = tmp;
6469 scalar_results.safe_push (new_temp);
6471 else if (direct_slp_reduc)
6473 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6474 with the elements for other SLP statements replaced with the
6475 neutral value. We can then do a normal reduction on each vector. */
6477 /* Enforced by vectorizable_reduction. */
6478 gcc_assert (reduc_inputs.length () == 1);
6479 gcc_assert (pow2p_hwi (group_size));
6481 gimple_seq seq = NULL;
6483 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6484 and the same element size as VECTYPE. */
6485 tree index = build_index_vector (vectype, 0, 1);
6486 tree index_type = TREE_TYPE (index);
6487 tree index_elt_type = TREE_TYPE (index_type);
6488 tree mask_type = truth_type_for (index_type);
6490 /* Create a vector that, for each element, identifies which of
6491 the REDUC_GROUP_SIZE results should use it. */
6492 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6493 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6494 build_vector_from_val (index_type, index_mask));
6496 /* Get a neutral vector value. This is simply a splat of the neutral
6497 scalar value if we have one, otherwise the initial scalar value
6498 is itself a neutral value. */
6499 tree vector_identity = NULL_TREE;
6500 tree neutral_op = NULL_TREE;
6501 if (slp_node)
6503 tree initial_value = NULL_TREE;
6504 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6505 initial_value = reduc_info->reduc_initial_values[0];
6506 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6507 initial_value, false);
6509 if (neutral_op)
6510 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6511 neutral_op);
6512 for (unsigned int i = 0; i < group_size; ++i)
6514 /* If there's no universal neutral value, we can use the
6515 initial scalar value from the original PHI. This is used
6516 for MIN and MAX reduction, for example. */
6517 if (!neutral_op)
6519 tree scalar_value = reduc_info->reduc_initial_values[i];
6520 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6521 scalar_value);
6522 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6523 scalar_value);
6526 /* Calculate the equivalent of:
6528 sel[j] = (index[j] == i);
6530 which selects the elements of REDUC_INPUTS[0] that should
6531 be included in the result. */
6532 tree compare_val = build_int_cst (index_elt_type, i);
6533 compare_val = build_vector_from_val (index_type, compare_val);
6534 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6535 index, compare_val);
6537 /* Calculate the equivalent of:
6539 vec = sel ? reduc_inputs[0] : vector_identity;
6541 VEC is now suitable for a full vector reduction. */
6542 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6543 sel, reduc_inputs[0], vector_identity);
6545 /* Do the reduction and convert it to the appropriate type. */
6546 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6547 TREE_TYPE (vectype), vec);
6548 scalar = gimple_convert (&seq, scalar_type, scalar);
6549 scalar_results.safe_push (scalar);
6551 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
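/* Editorial illustration (not part of GCC): a scalar model of the direct SLP
   reduction above for a sum.  For each of the GROUP_SIZE results we keep only
   the lanes whose position modulo the group size selects them (the real code
   masks an index vector with GROUP_SIZE - 1, which is equivalent because the
   group size is a power of two) and replace the rest with the neutral value
   before a full reduction.  Hypothetical names; RESULTS holds GROUP_SIZE
   entries and NEUTRAL_OP is 0 for a sum.  */

static void
sketch_direct_slp_reduc_sum (const int *lanes, unsigned int nunits,
                             unsigned int group_size, int neutral_op,
                             int *results)
{
  for (unsigned int i = 0; i < group_size; ++i)
    {
      int acc = 0;
      for (unsigned int j = 0; j < nunits; ++j)
        acc += (j % group_size == i) ? lanes[j] : neutral_op;
      results[i] = acc;
    }
}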
6553 else
6555 bool reduce_with_shift;
6556 tree vec_temp;
6558 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6560 /* See if the target wants to do the final (shift) reduction
6561 in a vector mode of smaller size and first reduce upper/lower
6562 halves against each other. */
6563 enum machine_mode mode1 = mode;
6564 tree stype = TREE_TYPE (vectype);
6565 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6566 unsigned nunits1 = nunits;
6567 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6568 && reduc_inputs.length () == 1)
6570 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6571 /* For SLP reductions we have to make sure lanes match up, but
6572 since we're doing individual element final reduction reducing
6573 vector width here is even more important.
6574 ??? We could also separate lanes with permutes; for the common
6575 case of a power-of-two group size, odd/even extracts would work.
6576 if (slp_reduc && nunits != nunits1)
6578 nunits1 = least_common_multiple (nunits1, group_size);
6579 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6582 if (!slp_reduc
6583 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6584 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6586 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6587 stype, nunits1);
6588 reduce_with_shift = have_whole_vector_shift (mode1);
6589 if (!VECTOR_MODE_P (mode1)
6590 || !directly_supported_p (code, vectype1))
6591 reduce_with_shift = false;
6593 /* First reduce the vector to the desired vector size we should
6594 do shift reduction on by combining upper and lower halves. */
6595 gimple_seq stmts = NULL;
6596 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6597 code, &stmts);
6598 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6599 reduc_inputs[0] = new_temp;
6601 if (reduce_with_shift && !slp_reduc)
6603 int element_bitsize = tree_to_uhwi (bitsize);
6604 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6605 for variable-length vectors and also requires direct target support
6606 for loop reductions. */
6607 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6608 int nelements = vec_size_in_bits / element_bitsize;
6609 vec_perm_builder sel;
6610 vec_perm_indices indices;
6612 int elt_offset;
6614 tree zero_vec = build_zero_cst (vectype1);
6615 /* Case 2: Create:
6616 for (offset = nelements/2; offset >= 1; offset/=2)
6618 Create: va' = vec_shift <va, offset>
6619 Create: va = vop <va, va'>
6620 } */
6622 tree rhs;
6624 if (dump_enabled_p ())
6625 dump_printf_loc (MSG_NOTE, vect_location,
6626 "Reduce using vector shifts\n");
6628 gimple_seq stmts = NULL;
6629 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6630 for (elt_offset = nelements / 2;
6631 elt_offset >= 1;
6632 elt_offset /= 2)
6634 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6635 indices.new_vector (sel, 2, nelements);
6636 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6637 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6638 new_temp, zero_vec, mask);
6639 new_temp = gimple_build (&stmts, code,
6640 vectype1, new_name, new_temp);
6642 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6644 /* 2.4 Extract the final scalar result. Create:
6645 s_out3 = extract_field <v_out2, bitpos> */
6647 if (dump_enabled_p ())
6648 dump_printf_loc (MSG_NOTE, vect_location,
6649 "extract scalar result\n");
6651 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6652 bitsize, bitsize_zero_node);
6653 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6654 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6655 gimple_assign_set_lhs (epilog_stmt, new_temp);
6656 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6657 scalar_results.safe_push (new_temp);
6659 else
6661 /* Case 3: Create:
6662 s = extract_field <v_out2, 0>
6663 for (offset = element_size;
6664 offset < vector_size;
6665 offset += element_size)
6667 Create: s' = extract_field <v_out2, offset>
6668 Create: s = op <s, s'> // For non SLP cases
6669 } */
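/* Illustrative sketch (not part of the original sources): for a V4SI
   vector v_out2 the open-coded variant above amounts to

     s = v_out2[0];
     s = op (s, v_out2[1]);
     s = op (s, v_out2[2]);
     s = op (s, v_out2[3]);

   with the element reads emitted as BIT_FIELD_REFs in the loop below;
   for SLP only the extracts are kept and no combining op is emitted.  */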
6671 if (dump_enabled_p ())
6672 dump_printf_loc (MSG_NOTE, vect_location,
6673 "Reduce using scalar code.\n");
6675 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6676 int element_bitsize = tree_to_uhwi (bitsize);
6677 tree compute_type = TREE_TYPE (vectype);
6678 gimple_seq stmts = NULL;
6679 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6681 int bit_offset;
6682 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6683 vec_temp, bitsize, bitsize_zero_node);
6685 /* In SLP we don't need to apply the reduction operation, so we just
6686 collect the s' values in SCALAR_RESULTS. */
6687 if (slp_reduc)
6688 scalar_results.safe_push (new_temp);
6690 for (bit_offset = element_bitsize;
6691 bit_offset < vec_size_in_bits;
6692 bit_offset += element_bitsize)
6694 tree bitpos = bitsize_int (bit_offset);
6695 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6696 compute_type, vec_temp,
6697 bitsize, bitpos);
6698 if (slp_reduc)
6700 /* In SLP we don't need to apply the reduction operation, so
6701 we just collect the s' values in SCALAR_RESULTS. */
6702 new_temp = new_name;
6703 scalar_results.safe_push (new_name);
6705 else
6706 new_temp = gimple_build (&stmts, code, compute_type,
6707 new_name, new_temp);
6711 /* The only case where we need to reduce scalar results in SLP is
6712 unrolling. If the size of SCALAR_RESULTS is greater than
6713 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6714 REDUC_GROUP_SIZE. */
6715 if (slp_reduc)
6717 tree res, first_res, new_res;
6719 /* Reduce multiple scalar results in case of SLP unrolling. */
6720 for (j = group_size; scalar_results.iterate (j, &res);
6721 j++)
6723 first_res = scalar_results[j % group_size];
6724 new_res = gimple_build (&stmts, code, compute_type,
6725 first_res, res);
6726 scalar_results[j % group_size] = new_res;
6728 scalar_results.truncate (group_size);
6729 for (k = 0; k < group_size; k++)
6730 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6731 scalar_results[k]);
6733 else
6735 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6736 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6737 scalar_results.safe_push (new_temp);
6740 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6743 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6744 && induc_val)
6746 /* Earlier we set the initial value to be a vector of induc_val
6747 values. Check the result and if it is induc_val then replace
6748 it with the original initial value, unless induc_val is
6749 the same as initial_def already. */
6750 tree zcompare = make_ssa_name (boolean_type_node);
6751 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6752 induc_val);
6753 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6754 tree initial_def = reduc_info->reduc_initial_values[0];
6755 tree tmp = make_ssa_name (new_scalar_dest);
6756 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6757 initial_def, new_temp);
6758 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6759 scalar_results[0] = tmp;
6763 /* 2.5 Adjust the final result by the initial value of the reduction
6764 variable. (When such adjustment is not needed, then
6765 'adjustment_def' is zero). For example, if code is PLUS we create:
6766 new_temp = loop_exit_def + adjustment_def */
6768 if (adjustment_def)
6770 gcc_assert (!slp_reduc);
6771 gimple_seq stmts = NULL;
6772 if (double_reduc)
6774 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6775 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6776 new_temp = gimple_build (&stmts, code, vectype,
6777 reduc_inputs[0], adjustment_def);
6779 else
6781 new_temp = scalar_results[0];
6782 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6783 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6784 adjustment_def);
6785 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6786 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6787 new_temp, adjustment_def);
6788 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6791 epilog_stmt = gimple_seq_last_stmt (stmts);
6792 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6793 scalar_results[0] = new_temp;
6796 /* Record this operation if it could be reused by the epilogue loop. */
6797 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6798 && reduc_inputs.length () == 1)
6799 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6800 { orig_reduc_input, reduc_info });
6802 if (double_reduc)
6803 loop = outer_loop;
6805 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6806 phis with new adjusted scalar results, i.e., replace use <s_out0>
6807 with use <s_out4>.
6809 Transform:
6810 loop_exit:
6811 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6812 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6813 v_out2 = reduce <v_out1>
6814 s_out3 = extract_field <v_out2, 0>
6815 s_out4 = adjust_result <s_out3>
6816 use <s_out0>
6817 use <s_out0>
6819 into:
6821 loop_exit:
6822 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6823 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6824 v_out2 = reduce <v_out1>
6825 s_out3 = extract_field <v_out2, 0>
6826 s_out4 = adjust_result <s_out3>
6827 use <s_out4>
6828 use <s_out4> */
6830 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6831 for (k = 0; k < live_out_stmts.size (); k++)
6833 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6834 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6836 phis.create (3);
6837 /* Find the loop-closed-use at the loop exit of the original scalar
6838 result. (The reduction result is expected to have two immediate uses,
6839 one at the latch block, and one at the loop exit). For double
6840 reductions we are looking for exit phis of the outer loop. */
6841 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6843 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6845 if (!is_gimple_debug (USE_STMT (use_p)))
6846 phis.safe_push (USE_STMT (use_p));
6848 else
6850 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6852 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6854 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6856 if (!flow_bb_inside_loop_p (loop,
6857 gimple_bb (USE_STMT (phi_use_p)))
6858 && !is_gimple_debug (USE_STMT (phi_use_p)))
6859 phis.safe_push (USE_STMT (phi_use_p));
6865 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6867 /* Replace the uses: */
6868 orig_name = PHI_RESULT (exit_phi);
6870 /* Look for a single use at the target of the skip edge. */
6871 if (unify_with_main_loop_p)
6873 use_operand_p use_p;
6874 gimple *user;
6875 if (!single_imm_use (orig_name, &use_p, &user))
6876 gcc_unreachable ();
6877 orig_name = gimple_get_lhs (user);
6880 scalar_result = scalar_results[k];
6881 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6883 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6884 SET_USE (use_p, scalar_result);
6885 update_stmt (use_stmt);
6889 phis.release ();
6893 /* Return a vector of type VECTYPE that is equal to the vector select
6894 operation "MASK ? VEC : IDENTITY". Insert the select statements
6895 before GSI. */
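/* For example (illustrative only, not taken from the sources): with
   MASK = { -1, -1, 0, 0 }, VEC = { a, b, c, d } and
   IDENTITY = { 0, 0, 0, 0 } the returned SSA name holds { a, b, 0, 0 },
   so inactive lanes do not disturb a subsequent fold-left addition.  */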
6897 static tree
6898 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6899 tree vec, tree identity)
6901 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6902 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6903 mask, vec, identity);
6904 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6905 return cond;
6908 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6909 order, starting with LHS. Insert the extraction statements before GSI and
6910 associate the new scalar SSA names with variable SCALAR_DEST.
6911 Return the SSA name for the result. */
6913 static tree
6914 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6915 tree_code code, tree lhs, tree vector_rhs)
6917 tree vectype = TREE_TYPE (vector_rhs);
6918 tree scalar_type = TREE_TYPE (vectype);
6919 tree bitsize = TYPE_SIZE (scalar_type);
6920 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6921 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6923 for (unsigned HOST_WIDE_INT bit_offset = 0;
6924 bit_offset < vec_size_in_bits;
6925 bit_offset += element_bitsize)
6927 tree bitpos = bitsize_int (bit_offset);
6928 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6929 bitsize, bitpos);
6931 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6932 rhs = make_ssa_name (scalar_dest, stmt);
6933 gimple_assign_set_lhs (stmt, rhs);
6934 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6936 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6937 tree new_name = make_ssa_name (scalar_dest, stmt);
6938 gimple_assign_set_lhs (stmt, new_name);
6939 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6940 lhs = new_name;
6942 return lhs;
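/* Illustrative sketch (not part of the original code): for a 4-element
   VECTOR_RHS and PLUS_EXPR this expands to the strictly ordered chain

     lhs1 = lhs  + rhs[0];
     lhs2 = lhs1 + rhs[1];
     lhs3 = lhs2 + rhs[2];
     lhs4 = lhs3 + rhs[3];

   and returns lhs4, preserving the scalar evaluation order that an
   in-order (fold-left) reduction requires.  */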
6945 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6946 type of the vector input. */
6948 static internal_fn
6949 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6951 internal_fn mask_reduc_fn;
6952 internal_fn mask_len_reduc_fn;
6954 switch (reduc_fn)
6956 case IFN_FOLD_LEFT_PLUS:
6957 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6958 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6959 break;
6961 default:
6962 return IFN_LAST;
6965 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6966 OPTIMIZE_FOR_SPEED))
6967 return mask_reduc_fn;
6968 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6969 OPTIMIZE_FOR_SPEED))
6970 return mask_len_reduc_fn;
6971 return IFN_LAST;
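/* Illustrative note (an assumption based on the calls built further down,
   not on documentation in this file): when the masked variant is chosen,
   vectorize_fold_left_reduction emits roughly

     sum_1 = IFN_MASK_FOLD_LEFT_PLUS (sum_0, vec, loop_mask);

   and for the length-based variant

     sum_1 = IFN_MASK_LEN_FOLD_LEFT_PLUS (sum_0, vec, loop_mask, len, bias);

   so inactive or out-of-range lanes do not contribute to the result.  */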
6974 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6975 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6976 statement. CODE is the operation performed by STMT_INFO and OPS are
6977 its scalar operands. REDUC_INDEX is the index of the operand in
6978 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6979 implements in-order reduction, or IFN_LAST if we should open-code it.
6980 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6981 that should be used to control the operation in a fully-masked loop. */
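/* A minimal source-level example (illustrative only): without
   -ffast-math the loop

     double s = init;
     for (int i = 0; i < n; ++i)
       s += a[i];

   must preserve the scalar order of the additions, so each vector
   iteration performs an in-order FOLD_LEFT_PLUS of the current vector
   of 'a' elements into the scalar accumulator instead of keeping a
   vector of partial sums.  */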
6983 static bool
6984 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6985 stmt_vec_info stmt_info,
6986 gimple_stmt_iterator *gsi,
6987 gimple **vec_stmt, slp_tree slp_node,
6988 gimple *reduc_def_stmt,
6989 code_helper code, internal_fn reduc_fn,
6990 tree *ops, int num_ops, tree vectype_in,
6991 int reduc_index, vec_loop_masks *masks,
6992 vec_loop_lens *lens)
6994 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6995 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6996 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6998 int ncopies;
6999 if (slp_node)
7000 ncopies = 1;
7001 else
7002 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7004 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7005 gcc_assert (ncopies == 1);
7007 bool is_cond_op = false;
7008 if (!code.is_tree_code ())
7010 code = conditional_internal_fn_code (internal_fn (code));
7011 gcc_assert (code != ERROR_MARK);
7012 is_cond_op = true;
7015 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7017 if (slp_node)
7019 if (is_cond_op)
7021 if (dump_enabled_p ())
7022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7023 "fold-left reduction on SLP not supported.\n");
7024 return false;
7027 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7028 TYPE_VECTOR_SUBPARTS (vectype_in)));
7031 /* The operands either come from a binary operation or an IFN_COND operation.
7032 The former is a gimple assign with binary rhs and the latter is a
7033 gimple call with four arguments. */
7034 gcc_assert (num_ops == 2 || num_ops == 4);
7035 tree op0, opmask;
7036 if (!is_cond_op)
7037 op0 = ops[1 - reduc_index];
7038 else
7040 op0 = ops[2];
7041 opmask = ops[0];
7042 gcc_assert (!slp_node);
7045 int group_size = 1;
7046 stmt_vec_info scalar_dest_def_info;
7047 auto_vec<tree> vec_oprnds0, vec_opmask;
7048 if (slp_node)
7050 auto_vec<vec<tree> > vec_defs (2);
7051 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7052 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7053 vec_defs[0].release ();
7054 vec_defs[1].release ();
7055 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7056 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7058 else
7060 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7061 op0, &vec_oprnds0);
7062 scalar_dest_def_info = stmt_info;
7064 /* For an IFN_COND_OP we also need the vector mask operand. */
7065 if (is_cond_op)
7066 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7067 opmask, &vec_opmask);
7070 gimple *sdef = scalar_dest_def_info->stmt;
7071 tree scalar_dest = gimple_get_lhs (sdef);
7072 tree scalar_type = TREE_TYPE (scalar_dest);
7073 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7075 int vec_num = vec_oprnds0.length ();
7076 gcc_assert (vec_num == 1 || slp_node);
7077 tree vec_elem_type = TREE_TYPE (vectype_out);
7078 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7080 tree vector_identity = NULL_TREE;
7081 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7083 vector_identity = build_zero_cst (vectype_out);
7084 if (!HONOR_SIGNED_ZEROS (vectype_out))
7086 else
7088 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7089 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7090 vector_identity);
7094 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7095 int i;
7096 tree def0;
7097 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7099 gimple *new_stmt;
7100 tree mask = NULL_TREE;
7101 tree len = NULL_TREE;
7102 tree bias = NULL_TREE;
7103 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7104 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7105 else if (is_cond_op)
7106 mask = vec_opmask[0];
7107 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7109 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7110 i, 1);
7111 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7112 bias = build_int_cst (intQI_type_node, biasval);
7113 if (!is_cond_op)
7114 mask = build_minus_one_cst (truth_type_for (vectype_in));
7117 /* Handle MINUS by adding the negative. */
7118 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7120 tree negated = make_ssa_name (vectype_out);
7121 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7122 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7123 def0 = negated;
7126 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7127 && mask && mask_reduc_fn == IFN_LAST)
7128 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7129 vector_identity);
7131 /* On the first iteration the input is simply the scalar phi
7132 result, and for subsequent iterations it is the output of
7133 the preceding operation. */
7134 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7136 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7137 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7138 def0, mask, len, bias);
7139 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7140 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7141 def0, mask);
7142 else
7143 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7144 def0);
7145 /* For chained SLP reductions the output of the previous reduction
7146 operation serves as the input of the next. For the final statement
7147 the output cannot be a temporary - we reuse the original
7148 scalar destination of the last statement. */
7149 if (i != vec_num - 1)
7151 gimple_set_lhs (new_stmt, scalar_dest_var);
7152 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7153 gimple_set_lhs (new_stmt, reduc_var);
7156 else
7158 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7159 tree_code (code), reduc_var, def0);
7160 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7161 /* Remove the statement, so that we can use the same code paths
7162 as for statements that we've just created. */
7163 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7164 gsi_remove (&tmp_gsi, true);
7167 if (i == vec_num - 1)
7169 gimple_set_lhs (new_stmt, scalar_dest);
7170 vect_finish_replace_stmt (loop_vinfo,
7171 scalar_dest_def_info,
7172 new_stmt);
7174 else
7175 vect_finish_stmt_generation (loop_vinfo,
7176 scalar_dest_def_info,
7177 new_stmt, gsi);
7179 if (slp_node)
7180 slp_node->push_vec_def (new_stmt);
7181 else
7183 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7184 *vec_stmt = new_stmt;
7188 return true;
7191 /* Function is_nonwrapping_integer_induction.
7193 Check whether STMT_VINFO (which is part of loop LOOP) is an integer
7194 induction that only increments and cannot overflow. */
7196 static bool
7197 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7199 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7200 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7201 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7202 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7203 widest_int ni, max_loop_value, lhs_max;
7204 wi::overflow_type overflow = wi::OVF_NONE;
7206 /* Make sure the loop is integer based. */
7207 if (TREE_CODE (base) != INTEGER_CST
7208 || TREE_CODE (step) != INTEGER_CST)
7209 return false;
7211 /* Check that the max size of the loop will not wrap. */
7213 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7214 return true;
7216 if (! max_stmt_executions (loop, &ni))
7217 return false;
7219 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7220 &overflow);
7221 if (overflow)
7222 return false;
7224 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7225 TYPE_SIGN (lhs_type), &overflow);
7226 if (overflow)
7227 return false;
7229 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7230 <= TYPE_PRECISION (lhs_type));
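/* Worked example (illustrative, not from the sources): for an unsigned
   8-bit induction with base 0 and step 4 in a loop executing at most
   100 iterations, the final value can reach 0 + 4 * 100 = 400, which
   needs 9 bits of precision and therefore wraps in 8 bits, so the
   function returns false.  With step 1 the maximum is 100, which fits
   in 8 bits, and the function returns true.  */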
7233 /* Check if masking can be supported by inserting a conditional expression.
7234 CODE is the code for the operation. COND_FN is the conditional internal
7235 function, if it exists. VECTYPE_IN is the type of the vector input. */
7236 static bool
7237 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7238 tree vectype_in)
7240 if (cond_fn != IFN_LAST
7241 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7242 OPTIMIZE_FOR_SPEED))
7243 return false;
7245 if (code.is_tree_code ())
7246 switch (tree_code (code))
7248 case DOT_PROD_EXPR:
7249 case SAD_EXPR:
7250 return true;
7252 default:
7253 break;
7255 return false;
7258 /* Insert a conditional expression to enable masked vectorization. CODE is the
7259 code for the operation. VOP is the array of operands. MASK is the loop
7260 mask. GSI is a statement iterator used to place the new conditional
7261 expression. */
7262 static void
7263 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7264 gimple_stmt_iterator *gsi)
7266 switch (tree_code (code))
7268 case DOT_PROD_EXPR:
7270 tree vectype = TREE_TYPE (vop[1]);
7271 tree zero = build_zero_cst (vectype);
7272 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7273 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7274 mask, vop[1], zero);
7275 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7276 vop[1] = masked_op1;
7277 break;
7280 case SAD_EXPR:
7282 tree vectype = TREE_TYPE (vop[1]);
7283 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7284 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7285 mask, vop[1], vop[0]);
7286 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7287 vop[1] = masked_op1;
7288 break;
7291 default:
7292 gcc_unreachable ();
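/* Design note with an illustrative sketch (not from the original code):
   for DOT_PROD_EXPR the select above gives

     masked_op1 = mask ? op1 : 0;
     acc += masked_op1 * op2;          // inactive lanes add 0

   while for SAD_EXPR it gives

     masked_op1 = mask ? op1 : op0;
     acc += abs (op0 - masked_op1);    // inactive lanes add |op0 - op0| = 0

   so in both cases masked-out lanes leave the accumulator unchanged.  */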
7296 /* Function vectorizable_reduction.
7298 Check if STMT_INFO performs a reduction operation that can be vectorized.
7299 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7300 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7301 Return true if STMT_INFO is vectorizable in this way.
7303 This function also handles reduction idioms (patterns) that have been
7304 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7305 may be of this form:
7306 X = pattern_expr (arg0, arg1, ..., X)
7307 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7308 sequence that had been detected and replaced by the pattern-stmt
7309 (STMT_INFO).
7311 This function also handles reduction of condition expressions, for example:
7312 for (int i = 0; i < N; i++)
7313 if (a[i] < value)
7314 last = a[i];
7315 This is handled by vectorising the loop and creating an additional vector
7316 containing the loop indexes for which "a[i] < value" was true. In the
7317 function epilogue this is reduced to a single max value and then used to
7318 index into the vector of results.
7320 In some cases of reduction patterns, the type of the reduction variable X is
7321 different than the type of the other arguments of STMT_INFO.
7322 In such cases, the vectype that is used when transforming STMT_INFO into
7323 a vector stmt is different than the vectype that is used to determine the
7324 vectorization factor, because it consists of a different number of elements
7325 than the actual number of elements that are being operated upon in parallel.
7327 For example, consider an accumulation of shorts into an int accumulator.
7328 On some targets it's possible to vectorize this pattern operating on 8
7329 shorts at a time (hence, the vectype for purposes of determining the
7330 vectorization factor should be V8HI); on the other hand, the vectype that
7331 is used to create the vector form is actually V4SI (the type of the result).
7333 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7334 indicates what is the actual level of parallelism (V8HI in the example), so
7335 that the right vectorization factor would be derived. This vectype
7336 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7337 be used to create the vectorized stmt. The right vectype for the vectorized
7338 stmt is obtained from the type of the result X:
7339 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7341 This means that, contrary to "regular" reductions (or "regular" stmts in
7342 general), the following equation:
7343 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7344 does *NOT* necessarily hold for reduction patterns. */
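/* A short source-level illustration of the widening case discussed above
   (not taken from the original comment):

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];

   Here the vectorization factor is derived from V8HI (eight shorts per
   vector) while the vectorized statement itself produces V4SI values,
   assuming 128-bit vectors.  */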
7346 bool
7347 vectorizable_reduction (loop_vec_info loop_vinfo,
7348 stmt_vec_info stmt_info, slp_tree slp_node,
7349 slp_instance slp_node_instance,
7350 stmt_vector_for_cost *cost_vec)
7352 tree vectype_in = NULL_TREE;
7353 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7354 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7355 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7356 stmt_vec_info cond_stmt_vinfo = NULL;
7357 int i;
7358 int ncopies;
7359 bool single_defuse_cycle = false;
7360 bool nested_cycle = false;
7361 bool double_reduc = false;
7362 int vec_num;
7363 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7364 tree cond_reduc_val = NULL_TREE;
7366 /* Make sure it was already recognized as a reduction computation. */
7367 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7368 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7369 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7370 return false;
7372 /* The stmt we store reduction analysis meta on. */
7373 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7374 reduc_info->is_reduc_info = true;
7376 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7378 if (is_a <gphi *> (stmt_info->stmt))
7380 if (slp_node)
7382 /* We eventually need to set a vector type on invariant
7383 arguments. */
7384 unsigned j;
7385 slp_tree child;
7386 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7387 if (!vect_maybe_update_slp_op_vectype
7388 (child, SLP_TREE_VECTYPE (slp_node)))
7390 if (dump_enabled_p ())
7391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7392 "incompatible vector types for "
7393 "invariants\n");
7394 return false;
7397 /* Analysis for double-reduction is done on the outer
7398 loop PHI; nested cycles have no further restrictions. */
7399 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7401 else
7402 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7403 return true;
7406 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7407 stmt_vec_info phi_info = stmt_info;
7408 if (!is_a <gphi *> (stmt_info->stmt))
7410 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7411 return true;
7413 if (slp_node)
7415 slp_node_instance->reduc_phis = slp_node;
7416 /* ??? We're leaving slp_node to point to the PHIs, we only
7417 need it to get at the number of vector stmts which wasn't
7418 yet initialized for the instance root. */
7420 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7422 use_operand_p use_p;
7423 gimple *use_stmt;
7424 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7425 &use_p, &use_stmt);
7426 gcc_assert (res);
7427 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7430 /* PHIs should not participate in patterns. */
7431 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7432 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7434 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
7435 and compute the reduction chain length. Discover the real
7436 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7437 tree reduc_def
7438 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7439 loop_latch_edge
7440 (gimple_bb (reduc_def_phi)->loop_father));
7441 unsigned reduc_chain_length = 0;
7442 bool only_slp_reduc_chain = true;
7443 stmt_info = NULL;
7444 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7445 while (reduc_def != PHI_RESULT (reduc_def_phi))
7447 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7448 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7449 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7451 if (dump_enabled_p ())
7452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7453 "reduction chain broken by patterns.\n");
7454 return false;
7456 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7457 only_slp_reduc_chain = false;
7458 /* For epilogue generation live members of the chain need
7459 to point back to the PHI via their original stmt for
7460 info_for_reduction to work. For SLP we need to look at
7461 all lanes here - even though we will only vectorize from
7462 the SLP node with live lane zero, the other live lanes also
7463 need to be identified as part of a reduction to be able
7464 to skip code generation for them. */
7465 if (slp_for_stmt_info)
7467 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7468 if (STMT_VINFO_LIVE_P (s))
7469 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7471 else if (STMT_VINFO_LIVE_P (vdef))
7472 STMT_VINFO_REDUC_DEF (def) = phi_info;
7473 gimple_match_op op;
7474 if (!gimple_extract_op (vdef->stmt, &op))
7476 if (dump_enabled_p ())
7477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7478 "reduction chain includes unsupported"
7479 " statement type.\n");
7480 return false;
7482 if (CONVERT_EXPR_CODE_P (op.code))
7484 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7486 if (dump_enabled_p ())
7487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7488 "conversion in the reduction chain.\n");
7489 return false;
7492 else if (!stmt_info)
7493 /* First non-conversion stmt. */
7494 stmt_info = vdef;
7495 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7496 reduc_chain_length++;
7497 if (!stmt_info && slp_node)
7498 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7500 /* PHIs should not participate in patterns. */
7501 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7503 if (nested_in_vect_loop_p (loop, stmt_info))
7505 loop = loop->inner;
7506 nested_cycle = true;
7509 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7510 element. */
7511 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7513 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7514 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7516 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7517 gcc_assert (slp_node
7518 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7520 /* 1. Is vectorizable reduction? */
7521 /* Not supportable if the reduction variable is used in the loop, unless
7522 it's a reduction chain. */
7523 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7524 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7525 return false;
7527 /* Reductions that are not used even in an enclosing outer-loop
7528 are expected to be "live" (used out of the loop). */
7529 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7530 && !STMT_VINFO_LIVE_P (stmt_info))
7531 return false;
7533 /* 2. Has this been recognized as a reduction pattern?
7535 Check if STMT represents a pattern that has been recognized
7536 in earlier analysis stages. For stmts that represent a pattern,
7537 the STMT_VINFO_RELATED_STMT field records the last stmt in
7538 the original sequence that constitutes the pattern. */
7540 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7541 if (orig_stmt_info)
7543 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7544 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7547 /* 3. Check the operands of the operation. The first operands are defined
7548 inside the loop body. The last operand is the reduction variable,
7549 which is defined by the loop-header-phi. */
7551 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7552 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7553 gimple_match_op op;
7554 if (!gimple_extract_op (stmt_info->stmt, &op))
7555 gcc_unreachable ();
7556 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7557 || op.code == WIDEN_SUM_EXPR
7558 || op.code == SAD_EXPR);
7560 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7561 && !SCALAR_FLOAT_TYPE_P (op.type))
7562 return false;
7564 /* Do not try to vectorize bit-precision reductions. */
7565 if (!type_has_mode_precision_p (op.type))
7566 return false;
7568 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7569 which means their only use may be in the lane-reducing operation. */
7570 if (lane_reduc_code_p
7571 && reduc_chain_length != 1
7572 && !only_slp_reduc_chain)
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7576 "lane-reducing reduction with extra stmts.\n");
7577 return false;
7580 /* All uses but the last are expected to be defined in the loop.
7581 The last use is the reduction variable. In case of nested cycle this
7582 assumption is not true: we use reduc_index to record the index of the
7583 reduction variable. */
7584 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7585 /* We need to skip an extra operand for COND_EXPRs with embedded
7586 comparison. */
7587 unsigned opno_adjust = 0;
7588 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7589 opno_adjust = 1;
7590 for (i = 0; i < (int) op.num_ops; i++)
7592 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7593 if (i == 0 && op.code == COND_EXPR)
7594 continue;
7596 stmt_vec_info def_stmt_info;
7597 enum vect_def_type dt;
7598 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7599 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7600 &vectype_op[i], &def_stmt_info))
7602 if (dump_enabled_p ())
7603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604 "use not simple.\n");
7605 return false;
7607 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7608 continue;
7610 /* For an IFN_COND_OP we might hit the reduction definition operand
7611 twice (once as definition, once as else). */
7612 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7613 continue;
7615 /* There should be only one cycle def in the stmt, the one
7616 leading to reduc_def. */
7617 if (VECTORIZABLE_CYCLE_DEF (dt))
7618 return false;
7620 if (!vectype_op[i])
7621 vectype_op[i]
7622 = get_vectype_for_scalar_type (loop_vinfo,
7623 TREE_TYPE (op.ops[i]), slp_op[i]);
7625 /* To properly compute ncopies we are interested in the widest
7626 non-reduction input type in case we're looking at a widening
7627 accumulation that we later handle in vect_transform_reduction. */
7628 if (lane_reduc_code_p
7629 && vectype_op[i]
7630 && (!vectype_in
7631 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7632 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7633 vectype_in = vectype_op[i];
7635 if (op.code == COND_EXPR)
7637 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7638 if (dt == vect_constant_def)
7640 cond_reduc_dt = dt;
7641 cond_reduc_val = op.ops[i];
7643 if (dt == vect_induction_def
7644 && def_stmt_info
7645 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7647 cond_reduc_dt = dt;
7648 cond_stmt_vinfo = def_stmt_info;
7652 if (!vectype_in)
7653 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7654 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7656 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7657 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7658 /* If we have a condition reduction, see if we can simplify it further. */
7659 if (v_reduc_type == COND_REDUCTION)
7661 if (slp_node)
7662 return false;
7664 /* If the reduction value is used in the condition itself, fail. */
7665 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7667 if (dump_enabled_p ())
7668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7669 "condition depends on previous iteration\n");
7670 return false;
7673 if (reduc_chain_length == 1
7674 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7675 OPTIMIZE_FOR_SPEED)
7676 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7677 vectype_in,
7678 OPTIMIZE_FOR_SPEED)))
7680 if (dump_enabled_p ())
7681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7682 "optimizing condition reduction with"
7683 " FOLD_EXTRACT_LAST.\n");
7684 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7686 else if (cond_reduc_dt == vect_induction_def)
7688 tree base
7689 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7690 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7692 gcc_assert (TREE_CODE (base) == INTEGER_CST
7693 && TREE_CODE (step) == INTEGER_CST);
7694 cond_reduc_val = NULL_TREE;
7695 enum tree_code cond_reduc_op_code = ERROR_MARK;
7696 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7697 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7699 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7700 above base; punt if base is the minimum value of the type for
7701 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7702 else if (tree_int_cst_sgn (step) == -1)
7704 cond_reduc_op_code = MIN_EXPR;
7705 if (tree_int_cst_sgn (base) == -1)
7706 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7707 else if (tree_int_cst_lt (base,
7708 TYPE_MAX_VALUE (TREE_TYPE (base))))
7709 cond_reduc_val
7710 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7712 else
7714 cond_reduc_op_code = MAX_EXPR;
7715 if (tree_int_cst_sgn (base) == 1)
7716 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7717 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7718 base))
7719 cond_reduc_val
7720 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7722 if (cond_reduc_val)
7724 if (dump_enabled_p ())
7725 dump_printf_loc (MSG_NOTE, vect_location,
7726 "condition expression based on "
7727 "integer induction.\n");
7728 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7729 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7730 = cond_reduc_val;
7731 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7734 else if (cond_reduc_dt == vect_constant_def)
7736 enum vect_def_type cond_initial_dt;
7737 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7738 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7739 if (cond_initial_dt == vect_constant_def
7740 && types_compatible_p (TREE_TYPE (cond_initial_val),
7741 TREE_TYPE (cond_reduc_val)))
7743 tree e = fold_binary (LE_EXPR, boolean_type_node,
7744 cond_initial_val, cond_reduc_val);
7745 if (e && (integer_onep (e) || integer_zerop (e)))
7747 if (dump_enabled_p ())
7748 dump_printf_loc (MSG_NOTE, vect_location,
7749 "condition expression based on "
7750 "compile time constant.\n");
7751 /* Record reduction code at analysis stage. */
7752 STMT_VINFO_REDUC_CODE (reduc_info)
7753 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7754 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7760 if (STMT_VINFO_LIVE_P (phi_info))
7761 return false;
7763 if (slp_node)
7764 ncopies = 1;
7765 else
7766 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7768 gcc_assert (ncopies >= 1);
7770 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7772 if (nested_cycle)
7774 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7775 == vect_double_reduction_def);
7776 double_reduc = true;
7779 /* 4.2. Check support for the epilog operation.
7781 If STMT represents a reduction pattern, then the type of the
7782 reduction variable may be different than the type of the rest
7783 of the arguments. For example, consider the case of accumulation
7784 of shorts into an int accumulator; The original code:
7785 S1: int_a = (int) short_a;
7786 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7788 was replaced with:
7789 STMT: int_acc = widen_sum <short_a, int_acc>
7791 This means that:
7792 1. The tree-code that is used to create the vector operation in the
7793 epilog code (that reduces the partial results) is not the
7794 tree-code of STMT, but is rather the tree-code of the original
7795 stmt from the pattern that STMT is replacing. I.e, in the example
7796 above we want to use 'widen_sum' in the loop, but 'plus' in the
7797 epilog.
7798 2. The type (mode) we use to check available target support
7799 for the vector operation to be created in the *epilog*, is
7800 determined by the type of the reduction variable (in the example
7801 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7802 However the type (mode) we use to check available target support
7803 for the vector operation to be created *inside the loop*, is
7804 determined by the type of the other arguments to STMT (in the
7805 example we'd check this: optab_handler (widen_sum_optab,
7806 vect_short_mode)).
7808 This is contrary to "regular" reductions, in which the types of all
7809 the arguments are the same as the type of the reduction variable.
7810 For "regular" reductions we can therefore use the same vector type
7811 (and also the same tree-code) when generating the epilog code and
7812 when generating the code inside the loop. */
7814 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7816 /* If-conversion might already have created a conditional operation like
7817 IFN_COND_ADD. Use the internal code for the following checks. */
7818 if (orig_code.is_internal_fn ())
7820 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7821 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7824 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7826 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7827 if (reduction_type == TREE_CODE_REDUCTION)
7829 /* Check whether it's ok to change the order of the computation.
7830 Generally, when vectorizing a reduction we change the order of the
7831 computation. This may change the behavior of the program in some
7832 cases, so we need to check that this is ok. One exception is when
7833 vectorizing an outer-loop: the inner-loop is executed sequentially,
7834 and therefore vectorizing reductions in the inner-loop during
7835 outer-loop vectorization is safe. Likewise when we are vectorizing
7836 a series of reductions using SLP and the VF is one, the reductions
7837 are performed in scalar order. */
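/* For instance (illustrative only): with float addition,
   (a + b) + c need not equal a + (b + c), so reassociating a float sum
   into vector partial sums changes the rounding unless fast-math style
   flags permit it; that is what needs_fold_left_reduction_p checks below.  */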
7838 if (slp_node
7839 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7840 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7842 else if (needs_fold_left_reduction_p (op.type, orig_code))
7844 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7845 is not directly used in stmt. */
7846 if (!only_slp_reduc_chain
7847 && reduc_chain_length != 1)
7849 if (dump_enabled_p ())
7850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7851 "in-order reduction chain without SLP.\n");
7852 return false;
7854 STMT_VINFO_REDUC_TYPE (reduc_info)
7855 = reduction_type = FOLD_LEFT_REDUCTION;
7857 else if (!commutative_binary_op_p (orig_code, op.type)
7858 || !associative_binary_op_p (orig_code, op.type))
7860 if (dump_enabled_p ())
7861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7862 "reduction: not commutative/associative\n");
7863 return false;
7867 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7868 && ncopies > 1)
7870 if (dump_enabled_p ())
7871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7872 "multiple types in double reduction or condition "
7873 "reduction or fold-left reduction.\n");
7874 return false;
7877 internal_fn reduc_fn = IFN_LAST;
7878 if (reduction_type == TREE_CODE_REDUCTION
7879 || reduction_type == FOLD_LEFT_REDUCTION
7880 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7881 || reduction_type == CONST_COND_REDUCTION)
7883 if (reduction_type == FOLD_LEFT_REDUCTION
7884 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7885 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7887 if (reduc_fn != IFN_LAST
7888 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7889 OPTIMIZE_FOR_SPEED))
7891 if (dump_enabled_p ())
7892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7893 "reduc op not supported by target.\n");
7895 reduc_fn = IFN_LAST;
7898 else
7900 if (!nested_cycle || double_reduc)
7902 if (dump_enabled_p ())
7903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7904 "no reduc code for scalar code.\n");
7906 return false;
7910 else if (reduction_type == COND_REDUCTION)
7912 int scalar_precision
7913 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7914 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7915 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7916 vectype_out);
7918 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7919 OPTIMIZE_FOR_SPEED))
7920 reduc_fn = IFN_REDUC_MAX;
7922 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7924 if (reduction_type != EXTRACT_LAST_REDUCTION
7925 && (!nested_cycle || double_reduc)
7926 && reduc_fn == IFN_LAST
7927 && !nunits_out.is_constant ())
7929 if (dump_enabled_p ())
7930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7931 "missing target support for reduction on"
7932 " variable-length vectors.\n");
7933 return false;
7936 /* For SLP reductions, see if there is a neutral value we can use. */
7937 tree neutral_op = NULL_TREE;
7938 if (slp_node)
7940 tree initial_value = NULL_TREE;
7941 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7942 initial_value = vect_phi_initial_value (reduc_def_phi);
7943 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7944 orig_code, initial_value);
7947 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7949 /* We can't support in-order reductions of code such as this:
7951 for (int i = 0; i < n1; ++i)
7952 for (int j = 0; j < n2; ++j)
7953 l += a[j];
7955 since GCC effectively transforms the loop when vectorizing:
7957 for (int i = 0; i < n1 / VF; ++i)
7958 for (int j = 0; j < n2; ++j)
7959 for (int k = 0; k < VF; ++k)
7960 l += a[j];
7962 which is a reassociation of the original operation. */
7963 if (dump_enabled_p ())
7964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7965 "in-order double reduction not supported.\n");
7967 return false;
7970 if (reduction_type == FOLD_LEFT_REDUCTION
7971 && slp_node
7972 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7974 /* We cannot use in-order reductions in this case because there is
7975 an implicit reassociation of the operations involved. */
7976 if (dump_enabled_p ())
7977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7978 "in-order unchained SLP reductions not supported.\n");
7979 return false;
7982 /* For double reductions, and for SLP reductions with a neutral value,
7983 we construct a variable-length initial vector by loading a vector
7984 full of the neutral value and then shift-and-inserting the start
7985 values into the low-numbered elements. */
7986 if ((double_reduc || neutral_op)
7987 && !nunits_out.is_constant ()
7988 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7989 vectype_out, OPTIMIZE_FOR_SPEED))
7991 if (dump_enabled_p ())
7992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7993 "reduction on variable-length vectors requires"
7994 " target support for a vector-shift-and-insert"
7995 " operation.\n");
7996 return false;
7999 /* Check extra constraints for variable-length unchained SLP reductions. */
8000 if (slp_node
8001 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8002 && !nunits_out.is_constant ())
8004 /* We checked above that we could build the initial vector when
8005 there's a neutral element value. Check here for the case in
8006 which each SLP statement has its own initial value and in which
8007 that value needs to be repeated for every instance of the
8008 statement within the initial vector. */
8009 unsigned int group_size = SLP_TREE_LANES (slp_node);
8010 if (!neutral_op
8011 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8012 TREE_TYPE (vectype_out)))
8014 if (dump_enabled_p ())
8015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8016 "unsupported form of SLP reduction for"
8017 " variable-length vectors: cannot build"
8018 " initial vector.\n");
8019 return false;
8021 /* The epilogue code relies on the number of elements being a multiple
8022 of the group size. The duplicate-and-interleave approach to setting
8023 up the initial vector does too. */
8024 if (!multiple_p (nunits_out, group_size))
8026 if (dump_enabled_p ())
8027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8028 "unsupported form of SLP reduction for"
8029 " variable-length vectors: the vector size"
8030 " is not a multiple of the number of results.\n");
8031 return false;
8035 if (reduction_type == COND_REDUCTION)
8037 widest_int ni;
8039 if (! max_loop_iterations (loop, &ni))
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_NOTE, vect_location,
8043 "loop count not known, cannot create cond "
8044 "reduction.\n");
8045 return false;
8047 /* Convert backedges to iterations. */
8048 ni += 1;
8050 /* The additional index will be the same type as the condition. Check
8051 that the loop can fit into this less one (because we'll use up the
8052 zero slot for when there are no matches). */
8053 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8054 if (wi::geu_p (ni, wi::to_widest (max_index)))
8056 if (dump_enabled_p ())
8057 dump_printf_loc (MSG_NOTE, vect_location,
8058 "loop size is greater than data size.\n");
8059 return false;
8063 /* In case the vectorization factor (VF) is bigger than the number
8064 of elements that we can fit in a vectype (nunits), we have to generate
8065 more than one vector stmt, i.e. we need to "unroll" the
8066 vector stmt by a factor VF/nunits. For more details see documentation
8067 in vectorizable_operation. */
8069 /* If the reduction is used in an outer loop we need to generate
8070 VF intermediate results, like so (e.g. for ncopies=2):
8071 r0 = phi (init, r0)
8072 r1 = phi (init, r1)
8073 r0 = x0 + r0;
8074 r1 = x1 + r1;
8075 (i.e. we generate VF results in 2 registers).
8076 In this case we have a separate def-use cycle for each copy, and therefore
8077 for each copy we get the vector def for the reduction variable from the
8078 respective phi node created for this copy.
8080 Otherwise (the reduction is unused in the loop nest), we can combine
8081 together intermediate results, like so (e.g. for ncopies=2):
8082 r = phi (init, r)
8083 r = x0 + r;
8084 r = x1 + r;
8085 (i.e. we generate VF/2 results in a single register).
8086 In this case for each copy we get the vector def for the reduction variable
8087 from the vectorized reduction operation generated in the previous iteration.
8089 This only works when we see both the reduction PHI and its only consumer
8090 in vectorizable_reduction and there are no intermediate stmts
8091 participating. When unrolling we want each unrolled iteration to have its
8092 own reduction accumulator since one of the main goals of unrolling a
8093 reduction is to reduce the aggregate loop-carried latency. */
8094 if (ncopies > 1
8095 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8096 && reduc_chain_length == 1
8097 && loop_vinfo->suggested_unroll_factor == 1)
8098 single_defuse_cycle = true;
8100 if (single_defuse_cycle || lane_reduc_code_p)
8102 gcc_assert (op.code != COND_EXPR);
8104 /* 4. Supportable by target? */
8105 bool ok = true;
8107 /* 4.1. check support for the operation in the loop
8109 This isn't necessary for the lane reduction codes, since they
8110 can only be produced by pattern matching, and it's up to the
8111 pattern matcher to test for support. The main reason for
8112 specifically skipping this step is to avoid rechecking whether
8113 mixed-sign dot-products can be implemented using signed
8114 dot-products. */
8115 machine_mode vec_mode = TYPE_MODE (vectype_in);
8116 if (!lane_reduc_code_p
8117 && !directly_supported_p (op.code, vectype_in, optab_vector))
8119 if (dump_enabled_p ())
8120 dump_printf (MSG_NOTE, "op not supported by target.\n");
8121 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8122 || !vect_can_vectorize_without_simd_p (op.code))
8123 ok = false;
8124 else
8125 if (dump_enabled_p ())
8126 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8129 if (vect_emulated_vector_p (vectype_in)
8130 && !vect_can_vectorize_without_simd_p (op.code))
8132 if (dump_enabled_p ())
8133 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8134 return false;
8137 /* lane-reducing operations have to go through vect_transform_reduction.
8138 For the other cases try without the single cycle optimization. */
8139 if (!ok)
8141 if (lane_reduc_code_p)
8142 return false;
8143 else
8144 single_defuse_cycle = false;
8147 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8149 /* If the reduction stmt is one of the patterns that have lane
8150 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
8151 if ((ncopies > 1 && ! single_defuse_cycle)
8152 && lane_reduc_code_p)
8154 if (dump_enabled_p ())
8155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8156 "multi def-use cycle not possible for lane-reducing "
8157 "reduction operation\n");
8158 return false;
8161 if (slp_node
8162 && !(!single_defuse_cycle
8163 && !lane_reduc_code_p
8164 && reduction_type != FOLD_LEFT_REDUCTION))
8165 for (i = 0; i < (int) op.num_ops; i++)
8166 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8168 if (dump_enabled_p ())
8169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8170 "incompatible vector types for invariants\n");
8171 return false;
8174 if (slp_node)
8175 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8176 else
8177 vec_num = 1;
8179 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8180 reduction_type, ncopies, cost_vec);
8181 /* Cost the reduction op inside the loop if transformed via
8182 vect_transform_reduction. Otherwise this is costed by the
8183 separate vectorizable_* routines. */
8184 if (single_defuse_cycle || lane_reduc_code_p)
8186 int factor = 1;
8187 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8188 /* Three dot-products and a subtraction. */
8189 factor = 4;
8190 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8191 stmt_info, 0, vect_body);
8194 if (dump_enabled_p ()
8195 && reduction_type == FOLD_LEFT_REDUCTION)
8196 dump_printf_loc (MSG_NOTE, vect_location,
8197 "using an in-order (fold-left) reduction.\n");
8198 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8199 /* All reductions except single defuse-cycle optimized, lane-reducing
8200 and fold-left ones go through their own vectorizable_* routines. */
8201 if (!single_defuse_cycle
8202 && !lane_reduc_code_p
8203 && reduction_type != FOLD_LEFT_REDUCTION)
8205 stmt_vec_info tem
8206 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8207 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8209 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8210 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8212 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8213 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8215 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8217 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8218 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8219 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8221 if (reduction_type != FOLD_LEFT_REDUCTION
8222 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8223 && (cond_fn == IFN_LAST
8224 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8225 OPTIMIZE_FOR_SPEED)))
8227 if (dump_enabled_p ())
8228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8229 "can't operate on partial vectors because"
8230 " no conditional operation is available.\n");
8231 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8233 else if (reduction_type == FOLD_LEFT_REDUCTION
8234 && reduc_fn == IFN_LAST
8235 && !expand_vec_cond_expr_p (vectype_in,
8236 truth_type_for (vectype_in),
8237 SSA_NAME))
8239 if (dump_enabled_p ())
8240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8241 "can't operate on partial vectors because"
8242 " no conditional operation is available.\n");
8243 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8245 else if (reduction_type == FOLD_LEFT_REDUCTION
8246 && internal_fn_mask_index (reduc_fn) == -1
8247 && FLOAT_TYPE_P (vectype_in)
8248 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8250 if (dump_enabled_p ())
8251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8252 "can't operate on partial vectors because"
8253 " signed zeros cannot be preserved.\n");
8254 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8256 else
8258 internal_fn mask_reduc_fn
8259 = get_masked_reduction_fn (reduc_fn, vectype_in);
8261 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8262 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8263 vectype_in, 1);
8264 else
8265 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8266 vectype_in, NULL);
8269 return true;
8272 /* STMT_INFO is a dot-product reduction whose multiplication operands
8273 have different signs. Emit a sequence to emulate the operation
8274 using a series of signed DOT_PROD_EXPRs and return the last
8275 statement generated. VEC_DEST is the result of the vector operation
8276 and VOP lists its inputs. */
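/* As an illustrative sketch (assuming 8-bit inputs; an assumed example, not
   part of the description above): a mixed-sign dot-product such as

     int sum = 0;
     unsigned char x[N];
     signed char y[N];
     for (int i = 0; i < N; i++)
       sum += x[i] * y[i];

   is emulated with signed-only DOT_PROD_EXPRs using the identity
   x * y == (x - 128) * y + 64 * y + 64 * y; e.g. for x == 200, y == -3:
   72 * -3 + 64 * -3 + 64 * -3 == -216 - 192 - 192 == -600 == 200 * -3.  */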
8278 static gassign *
8279 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8280 gimple_stmt_iterator *gsi, tree vec_dest,
8281 tree vop[3])
8283 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8284 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8285 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8286 gimple *new_stmt;
8288 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8289 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8290 std::swap (vop[0], vop[1]);
8292 /* Convert all inputs to signed types. */
8293 for (int i = 0; i < 3; ++i)
8294 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8296 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8297 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8298 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8299 vop[i] = tmp;
8302 /* In the comments below we assume 8-bit inputs for simplicity,
8303 but the approach works for any full integer type. */
8305 /* Create a vector of -128. */
8306 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8307 tree min_narrow = build_vector_from_val (narrow_vectype,
8308 min_narrow_elttype);
8310 /* Create a vector of 64. */
8311 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8312 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8313 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8315 /* Emit: SUB_RES = VOP[0] - 128. */
8316 tree sub_res = make_ssa_name (narrow_vectype);
8317 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8318 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8320 /* Emit:
8322 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8323 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8324 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8326 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
8327 Doing the two 64 * y steps first allows more time to compute x. */
8328 tree stage1 = make_ssa_name (wide_vectype);
8329 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8330 vop[1], half_narrow, vop[2]);
8331 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8333 tree stage2 = make_ssa_name (wide_vectype);
8334 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8335 vop[1], half_narrow, stage1);
8336 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8338 tree stage3 = make_ssa_name (wide_vectype);
8339 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8340 sub_res, vop[1], stage2);
8341 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8343 /* Convert STAGE3 to the reduction type. */
8344 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8347 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8348 value. */
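/* For illustration (a simplified sketch; masking and SLP are not shown):
   for a single-defuse-cycle sum reduction

     sum_1 = PHI <init, sum_2>
     sum_2 = sum_1 + a[i];          <-- STMT_INFO

   this emits the vector statement for the backedge value, e.g.
   vec_sum_2 = vec_sum_1 + vec_a; reducing the vector accumulator to a
   scalar is done separately by the epilogue code
   (vect_create_epilog_for_reduction).  */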
8350 bool
8351 vect_transform_reduction (loop_vec_info loop_vinfo,
8352 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8353 gimple **vec_stmt, slp_tree slp_node)
8355 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8356 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8357 int i;
8358 int ncopies;
8359 int vec_num;
8361 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8362 gcc_assert (reduc_info->is_reduc_info);
8364 if (nested_in_vect_loop_p (loop, stmt_info))
8366 loop = loop->inner;
8367 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8370 gimple_match_op op;
8371 if (!gimple_extract_op (stmt_info->stmt, &op))
8372 gcc_unreachable ();
8374 /* All uses but the last are expected to be defined in the loop.
8375 The last use is the reduction variable. In case of nested cycle this
8376 assumption is not true: we use reduc_index to record the index of the
8377 reduction variable. */
8378 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8379 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8380 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8381 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8383 if (slp_node)
8385 ncopies = 1;
8386 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8388 else
8390 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8391 vec_num = 1;
8394 code_helper code = canonicalize_code (op.code, op.type);
8395 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8397 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8398 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8399 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8401 /* Transform. */
8402 tree new_temp = NULL_TREE;
8403 auto_vec<tree> vec_oprnds0;
8404 auto_vec<tree> vec_oprnds1;
8405 auto_vec<tree> vec_oprnds2;
8406 tree def0;
8408 if (dump_enabled_p ())
8409 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8411 /* FORNOW: Multiple types are not supported for condition. */
8412 if (code == COND_EXPR)
8413 gcc_assert (ncopies == 1);
8415 /* A binary COND_OP reduction must have the same definition and else
8416 value. */
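/* E.g. (an assumed if-converted form, for illustration only):
   sum_2 = .COND_ADD (mask, sum_1, a[i], sum_1), where the accumulator
   op.ops[1] and the else value op.ops[3] are the same SSA name.  */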
8417 bool cond_fn_p = code.is_internal_fn ()
8418 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8419 if (cond_fn_p)
8421 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8422 || code == IFN_COND_MUL || code == IFN_COND_AND
8423 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8424 gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
8427 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8429 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8430 if (reduction_type == FOLD_LEFT_REDUCTION)
8432 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8433 gcc_assert (code.is_tree_code () || cond_fn_p);
8434 return vectorize_fold_left_reduction
8435 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8436 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8437 reduc_index, masks, lens);
8440 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8441 gcc_assert (single_defuse_cycle
8442 || code == DOT_PROD_EXPR
8443 || code == WIDEN_SUM_EXPR
8444 || code == SAD_EXPR);
8446 /* Create the destination vector */
8447 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8448 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8450 /* Get NCOPIES vector definitions for all operands except the reduction
8451 definition. */
8452 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8453 single_defuse_cycle && reduc_index == 0
8454 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8455 single_defuse_cycle && reduc_index == 1
8456 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8457 op.num_ops == 4
8458 || (op.num_ops == 3
8459 && !(single_defuse_cycle && reduc_index == 2))
8460 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8462 /* For single def-use cycles get one copy of the vectorized reduction
8463 definition. */
8464 if (single_defuse_cycle)
8466 gcc_assert (!slp_node);
8467 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8468 op.ops[reduc_index],
8469 reduc_index == 0 ? &vec_oprnds0
8470 : (reduc_index == 1 ? &vec_oprnds1
8471 : &vec_oprnds2));
8474 bool emulated_mixed_dot_prod
8475 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8476 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8478 gimple *new_stmt;
8479 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8480 if (masked_loop_p && !mask_by_cond_expr)
8482 /* No conditional ifns have been defined for dot-product yet. */
8483 gcc_assert (code != DOT_PROD_EXPR);
8485 /* Make sure that the reduction accumulator is vop[0]. */
8486 if (reduc_index == 1)
8488 gcc_assert (commutative_binary_op_p (code, op.type));
8489 std::swap (vop[0], vop[1]);
8491 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8492 vec_num * ncopies, vectype_in, i);
8493 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8494 vop[0], vop[1], vop[0]);
8495 new_temp = make_ssa_name (vec_dest, call);
8496 gimple_call_set_lhs (call, new_temp);
8497 gimple_call_set_nothrow (call, true);
8498 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8499 new_stmt = call;
8501 else
8503 if (op.num_ops >= 3)
8504 vop[2] = vec_oprnds2[i];
8506 if (masked_loop_p && mask_by_cond_expr)
8508 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8509 vec_num * ncopies, vectype_in, i);
8510 build_vect_cond_expr (code, vop, mask, gsi);
8513 if (emulated_mixed_dot_prod)
8514 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8515 vec_dest, vop);
8517 else if (code.is_internal_fn () && !cond_fn_p)
8518 new_stmt = gimple_build_call_internal (internal_fn (code),
8519 op.num_ops,
8520 vop[0], vop[1], vop[2]);
8521 else if (code.is_internal_fn () && cond_fn_p)
8522 new_stmt = gimple_build_call_internal (internal_fn (code),
8523 op.num_ops,
8524 vop[0], vop[1], vop[2],
8525 vop[1]);
8526 else
8527 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8528 vop[0], vop[1], vop[2]);
8529 new_temp = make_ssa_name (vec_dest, new_stmt);
8530 gimple_set_lhs (new_stmt, new_temp);
8531 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8534 if (slp_node)
8535 slp_node->push_vec_def (new_stmt);
8536 else if (single_defuse_cycle
8537 && i < ncopies - 1)
8539 if (reduc_index == 0)
8540 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8541 else if (reduc_index == 1)
8542 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8543 else if (reduc_index == 2)
8544 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8546 else
8547 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8550 if (!slp_node)
8551 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8553 return true;
8556 /* Transform phase of a cycle PHI. */
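/* As an illustrative example (a sketch; the special cases are handled
   below): for a sum reduction with scalar initial value init_val and
   four lanes, the vector PHI can be seeded with the neutral element
   { 0, 0, 0, 0 } while init_val is recorded as
   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT and added back after the
   epilogue reduction.  */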
8558 bool
8559 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8560 stmt_vec_info stmt_info, gimple **vec_stmt,
8561 slp_tree slp_node, slp_instance slp_node_instance)
8563 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8564 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8565 int i;
8566 int ncopies;
8567 int j;
8568 bool nested_cycle = false;
8569 int vec_num;
8571 if (nested_in_vect_loop_p (loop, stmt_info))
8573 loop = loop->inner;
8574 nested_cycle = true;
8577 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8578 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8579 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8580 gcc_assert (reduc_info->is_reduc_info);
8582 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8583 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8584 /* Leave the scalar phi in place. */
8585 return true;
8587 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8588 /* For a nested cycle we do not fill the above. */
8589 if (!vectype_in)
8590 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8591 gcc_assert (vectype_in);
8593 if (slp_node)
8595 /* The size vect_schedule_slp_instance computes is off for us. */
8596 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8597 * SLP_TREE_LANES (slp_node), vectype_in);
8598 ncopies = 1;
8600 else
8602 vec_num = 1;
8603 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8606 /* Check whether we should use a single PHI node and accumulate
8607 vectors to one before the backedge. */
8608 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8609 ncopies = 1;
8611 /* Create the destination vector */
8612 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8613 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8614 vectype_out);
8616 /* Get the loop-entry arguments. */
8617 tree vec_initial_def = NULL_TREE;
8618 auto_vec<tree> vec_initial_defs;
8619 if (slp_node)
8621 vec_initial_defs.reserve (vec_num);
8622 if (nested_cycle)
8624 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8625 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8626 &vec_initial_defs);
8628 else
8630 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8631 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8632 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8634 unsigned int num_phis = stmts.length ();
8635 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8636 num_phis = 1;
8637 initial_values.reserve (num_phis);
8638 for (unsigned int i = 0; i < num_phis; ++i)
8640 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8641 initial_values.quick_push (vect_phi_initial_value (this_phi));
8643 if (vec_num == 1)
8644 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8645 if (!initial_values.is_empty ())
8647 tree initial_value
8648 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8649 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8650 tree neutral_op
8651 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8652 code, initial_value);
8653 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8654 &vec_initial_defs, vec_num,
8655 stmts.length (), neutral_op);
8659 else
8661 /* Get at the scalar def before the loop, that defines the initial
8662 value of the reduction variable. */
8663 tree initial_def = vect_phi_initial_value (phi);
8664 reduc_info->reduc_initial_values.safe_push (initial_def);
8665 /* Optimize: if for REDUC_MAX initial_def is smaller than the base value
8666 induc_val and we can't use zero for induc_val, use initial_def instead.
8667 Similarly for REDUC_MIN when initial_def is larger than the base. */
8668 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8670 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8671 if (TREE_CODE (initial_def) == INTEGER_CST
8672 && !integer_zerop (induc_val)
8673 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8674 && tree_int_cst_lt (initial_def, induc_val))
8675 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8676 && tree_int_cst_lt (induc_val, initial_def))))
8678 induc_val = initial_def;
8679 /* Communicate to the epilogue generation that we used the
8680 initial_def. */
8681 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8683 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8685 else if (nested_cycle)
8687 /* Do not use an adjustment def as that case is not supported
8688 correctly if ncopies is not one. */
8689 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8690 ncopies, initial_def,
8691 &vec_initial_defs);
8693 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8694 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8695 /* Fill the initial vector with the initial scalar value. */
8696 vec_initial_def
8697 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8698 initial_def, initial_def);
8699 else
8701 if (ncopies == 1)
8702 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8703 if (!reduc_info->reduc_initial_values.is_empty ())
8705 initial_def = reduc_info->reduc_initial_values[0];
8706 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8707 tree neutral_op
8708 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8709 code, initial_def);
8710 gcc_assert (neutral_op);
8711 /* Try to simplify the vector initialization by applying an
8712 adjustment after the reduction has been performed. */
8713 if (!reduc_info->reused_accumulator
8714 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8715 && !operand_equal_p (neutral_op, initial_def))
8717 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8718 = initial_def;
8719 initial_def = neutral_op;
8721 vec_initial_def
8722 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8723 initial_def, neutral_op);
8728 if (vec_initial_def)
8730 vec_initial_defs.create (ncopies);
8731 for (i = 0; i < ncopies; ++i)
8732 vec_initial_defs.quick_push (vec_initial_def);
8735 if (auto *accumulator = reduc_info->reused_accumulator)
8737 tree def = accumulator->reduc_input;
8738 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8740 unsigned int nreduc;
8741 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8742 (TREE_TYPE (def)),
8743 TYPE_VECTOR_SUBPARTS (vectype_out),
8744 &nreduc);
8745 gcc_assert (res);
8746 gimple_seq stmts = NULL;
8747 /* Reduce the single vector to a smaller one. */
8748 if (nreduc != 1)
8750 /* Perform the reduction in the appropriate type. */
8751 tree rvectype = vectype_out;
8752 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8753 TREE_TYPE (TREE_TYPE (def))))
8754 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8755 TYPE_VECTOR_SUBPARTS
8756 (vectype_out));
8757 def = vect_create_partial_epilog (def, rvectype,
8758 STMT_VINFO_REDUC_CODE
8759 (reduc_info),
8760 &stmts);
8762 /* The epilogue loop might use a different vector mode, like
8763 VNx2DI vs. V2DI. */
8764 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8766 tree reduc_type = build_vector_type_for_mode
8767 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8768 def = gimple_convert (&stmts, reduc_type, def);
8770 /* Adjust the input so we pick up the partially reduced value
8771 for the skip edge in vect_create_epilog_for_reduction. */
8772 accumulator->reduc_input = def;
8773 /* And the reduction could be carried out using a different sign. */
8774 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8775 def = gimple_convert (&stmts, vectype_out, def);
8776 if (loop_vinfo->main_loop_edge)
8778 /* While we'd like to insert on the edge, doing so would split
8779 blocks and disturb bookkeeping, and we will eventually need
8780 the value on the skip edge as well. Rely on sinking to fix up
8781 the optimal placement and insert in the predecessor. */
8782 gimple_stmt_iterator gsi
8783 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8784 /* Insert before a cond that eventually skips the
8785 epilogue. */
8786 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8787 gsi_prev (&gsi);
8788 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8790 else
8791 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8792 stmts);
8794 if (loop_vinfo->main_loop_edge)
8795 vec_initial_defs[0]
8796 = vect_get_main_loop_result (loop_vinfo, def,
8797 vec_initial_defs[0]);
8798 else
8799 vec_initial_defs.safe_push (def);
8802 /* Generate the reduction PHIs upfront. */
8803 for (i = 0; i < vec_num; i++)
8805 tree vec_init_def = vec_initial_defs[i];
8806 for (j = 0; j < ncopies; j++)
8808 /* Create the reduction-phi that defines the reduction
8809 operand. */
8810 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8812 /* Set the loop-entry arg of the reduction-phi. */
8813 if (j != 0 && nested_cycle)
8814 vec_init_def = vec_initial_defs[j];
8815 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8816 UNKNOWN_LOCATION);
8818 /* The loop-latch arg is set in epilogue processing. */
8820 if (slp_node)
8821 slp_node->push_vec_def (new_phi);
8822 else
8824 if (j == 0)
8825 *vec_stmt = new_phi;
8826 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8831 return true;
8834 /* Vectorizes LC PHIs. */
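/* For illustration: a loop-closed PHI such as x_2 = PHI <x_1(loop)>
   in the single-predecessor block after the loop is replaced by one
   single-argument vector PHI per vector def, e.g.
   vect_x_2 = PHI <vect_x_1(loop)>.  */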
8836 bool
8837 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8838 stmt_vec_info stmt_info, gimple **vec_stmt,
8839 slp_tree slp_node)
8841 if (!loop_vinfo
8842 || !is_a <gphi *> (stmt_info->stmt)
8843 || gimple_phi_num_args (stmt_info->stmt) != 1)
8844 return false;
8846 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8847 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8848 return false;
8850 if (!vec_stmt) /* transformation not required. */
8852 /* Deal with copies from externs or constants that are disguised as
8853 loop-closed PHI nodes (PR97886). */
8854 if (slp_node
8855 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8856 SLP_TREE_VECTYPE (slp_node)))
8858 if (dump_enabled_p ())
8859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8860 "incompatible vector types for invariants\n");
8861 return false;
8863 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8864 return true;
8867 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8868 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8869 basic_block bb = gimple_bb (stmt_info->stmt);
8870 edge e = single_pred_edge (bb);
8871 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8872 auto_vec<tree> vec_oprnds;
8873 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8874 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8875 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8876 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8878 /* Create the vectorized LC PHI node. */
8879 gphi *new_phi = create_phi_node (vec_dest, bb);
8880 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8881 if (slp_node)
8882 slp_node->push_vec_def (new_phi);
8883 else
8884 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8886 if (!slp_node)
8887 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8889 return true;
8892 /* Vectorizes PHIs. */
8894 bool
8895 vectorizable_phi (vec_info *,
8896 stmt_vec_info stmt_info, gimple **vec_stmt,
8897 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8899 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8900 return false;
8902 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8903 return false;
8905 tree vectype = SLP_TREE_VECTYPE (slp_node);
8907 if (!vec_stmt) /* transformation not required. */
8909 slp_tree child;
8910 unsigned i;
8911 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8912 if (!child)
8914 if (dump_enabled_p ())
8915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8916 "PHI node with unvectorized backedge def\n");
8917 return false;
8919 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8921 if (dump_enabled_p ())
8922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8923 "incompatible vector types for invariants\n");
8924 return false;
8926 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8927 && !useless_type_conversion_p (vectype,
8928 SLP_TREE_VECTYPE (child)))
8930 /* With bools we can have mask and non-mask precision vectors
8931 or different non-mask precisions. While pattern recognition is
8932 supposed to guarantee consistency here, bugs in it can cause
8933 mismatches (PR103489 and PR103800 for example).
8934 Deal with them here instead of ICEing later. */
8935 if (dump_enabled_p ())
8936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8937 "incompatible vector type setup from "
8938 "bool pattern detection\n");
8939 return false;
8942 /* For single-argument PHIs assume coalescing which means zero cost
8943 for the scalar and the vector PHIs. This avoids artificially
8944 favoring the vector path (but may pessimize it in some cases). */
8945 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8946 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8947 vector_stmt, stmt_info, vectype, 0, vect_body);
8948 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8949 return true;
8952 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8953 basic_block bb = gimple_bb (stmt_info->stmt);
8954 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8955 auto_vec<gphi *> new_phis;
8956 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8958 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8960 /* Skip not yet vectorized defs. */
8961 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8962 && SLP_TREE_VEC_DEFS (child).is_empty ())
8963 continue;
8965 auto_vec<tree> vec_oprnds;
8966 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8967 if (!new_phis.exists ())
8969 new_phis.create (vec_oprnds.length ());
8970 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8972 /* Create the vectorized PHI node. */
8973 new_phis.quick_push (create_phi_node (vec_dest, bb));
8974 slp_node->push_vec_def (new_phis[j]);
8977 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8978 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8979 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8981 /* We should have at least one already vectorized child. */
8982 gcc_assert (new_phis.exists ());
8984 return true;
8987 /* Vectorizes first-order recurrences. An overview of the transformation
8988 is described below. Suppose we have the following loop.
8990 int t = 0;
8991 for (int i = 0; i < n; ++i)
8993 b[i] = a[i] - t;
8994 t = a[i];
8997 There is a first-order recurrence on 't'. For this loop, the scalar IR
8998 looks (simplified) like:
9000 scalar.preheader:
9001 init = 0;
9003 scalar.body:
9004 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9005 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9006 _1 = a[i]
9007 b[i] = _1 - _2
9008 if (i < n) goto scalar.body
9010 In this example, _2 is a recurrence because its value depends on the
9011 previous iteration. We vectorize this as (VF = 4)
9013 vector.preheader:
9014 vect_init = vect_cst(..., ..., ..., 0)
9016 vector.body
9017 i = PHI <0(vector.preheader), i+4(vector.body)>
9018 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9019 vect_2 = a[i, i+1, i+2, i+3];
9020 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9021 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9022 if (..) goto vector.body
9024 In this function, vectorizable_recurr, we code generate both the
9025 vector PHI node and the permute since those together compute the
9026 vectorized value of the scalar PHI. We do not yet have the
9027 backedge value to fill in there nor into the vec_perm. Those
9028 are filled in maybe_set_vectorized_backedge_value and
9029 vect_schedule_scc.
9031 TODO: Since the scalar loop does not have a use of the recurrence
9032 outside of the loop the natural way to implement peeling via
9033 vectorizing the live value doesn't work. For now peeling of loops
9034 with a recurrence is not implemented. For SLP the supported cases
9035 are restricted to those requiring a single vector recurrence PHI. */
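/* An illustrative sketch of the permutation (assuming V4SI and an SLP
   group of two lanes, so DIST == 2): the selector is
   { nunits - 2, nunits - 1, nunits, nunits + 1 } == { 2, 3, 4, 5 },
   i.e. the last two lanes of the previous vector followed by the first
   two lanes of the current one; the check below requires
   2 * DIST <= nunits so that a single vector can make progress.  */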
9037 bool
9038 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9039 gimple **vec_stmt, slp_tree slp_node,
9040 stmt_vector_for_cost *cost_vec)
9042 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9043 return false;
9045 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9047 /* So far we only support first-order recurrence auto-vectorization. */
9048 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9049 return false;
9051 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9052 unsigned ncopies;
9053 if (slp_node)
9054 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9055 else
9056 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9057 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9058 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9059 /* We need to be able to make progress with a single vector. */
9060 if (maybe_gt (dist * 2, nunits))
9062 if (dump_enabled_p ())
9063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9064 "first order recurrence exceeds half of "
9065 "a vector\n");
9066 return false;
9069 /* First-order recurrence autovectorization needs to handle permutation
9070 with indices = [nunits-1, nunits, nunits+1, ...]. */
9071 vec_perm_builder sel (nunits, 1, 3);
9072 for (int i = 0; i < 3; ++i)
9073 sel.quick_push (nunits - dist + i);
9074 vec_perm_indices indices (sel, 2, nunits);
9076 if (!vec_stmt) /* transformation not required. */
9078 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9079 indices))
9080 return false;
9082 if (slp_node)
9084 /* We eventually need to set a vector type on invariant
9085 arguments. */
9086 unsigned j;
9087 slp_tree child;
9088 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9089 if (!vect_maybe_update_slp_op_vectype
9090 (child, SLP_TREE_VECTYPE (slp_node)))
9092 if (dump_enabled_p ())
9093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9094 "incompatible vector types for "
9095 "invariants\n");
9096 return false;
9099 /* The recurrence costs the initialization vector and one permute
9100 for each copy. */
9101 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9102 stmt_info, 0, vect_prologue);
9103 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9104 stmt_info, 0, vect_body);
9105 if (dump_enabled_p ())
9106 dump_printf_loc (MSG_NOTE, vect_location,
9107 "vectorizable_recurr: inside_cost = %d, "
9108 "prologue_cost = %d .\n", inside_cost,
9109 prologue_cost);
9111 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9112 return true;
9115 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9116 basic_block bb = gimple_bb (phi);
9117 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9118 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9120 gimple_seq stmts = NULL;
9121 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9122 gsi_insert_seq_on_edge_immediate (pe, stmts);
9124 tree vec_init = build_vector_from_val (vectype, preheader);
9125 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9127 /* Create the vectorized first-order PHI node. */
9128 tree vec_dest = vect_get_new_vect_var (vectype,
9129 vect_simple_var, "vec_recur_");
9130 gphi *new_phi = create_phi_node (vec_dest, bb);
9131 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9133 /* Insert the shuffles for the first-order recurrence autovectorization:
9134 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9135 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9137 /* Insert the required permute after the latch definition. The
9138 second and later operands are tentative and will be updated when we have
9139 vectorized the latch definition. */
9140 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9141 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9142 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9143 gsi_next (&gsi2);
9145 for (unsigned i = 0; i < ncopies; ++i)
9147 vec_dest = make_ssa_name (vectype);
9148 gassign *vperm
9149 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9150 i == 0 ? gimple_phi_result (new_phi) : NULL,
9151 NULL, perm);
9152 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9154 if (slp_node)
9155 slp_node->push_vec_def (vperm);
9156 else
9157 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9160 if (!slp_node)
9161 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9162 return true;
9165 /* Return true if VECTYPE represents a vector that requires lowering
9166 by the vector lowering pass. */
9168 bool
9169 vect_emulated_vector_p (tree vectype)
9171 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9172 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9173 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9176 /* Return true if we can emulate CODE on an integer mode representation
9177 of a vector. */
9179 bool
9180 vect_can_vectorize_without_simd_p (tree_code code)
9182 switch (code)
9184 case PLUS_EXPR:
9185 case MINUS_EXPR:
9186 case NEGATE_EXPR:
9187 case BIT_AND_EXPR:
9188 case BIT_IOR_EXPR:
9189 case BIT_XOR_EXPR:
9190 case BIT_NOT_EXPR:
9191 return true;
9193 default:
9194 return false;
9198 /* Likewise, but taking a code_helper. */
9200 bool
9201 vect_can_vectorize_without_simd_p (code_helper code)
9203 return (code.is_tree_code ()
9204 && vect_can_vectorize_without_simd_p (tree_code (code)));
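/* For example (illustrative): even without a V4QI vector mode, a bitwise
   AND of four packed chars can be carried out on a plain 32-bit integer
   word; the codes listed above are the ones the vectorizer allows for
   such emulated vector types.  */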
9207 /* Create vector init for vectorized iv. */
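/* For illustration (assuming four lanes, initial value X and scalar
   step S), the vectors built below are:
     vect_step_op_shr:  [ X, X >> S, X >> 2*S, X >> 3*S ]
     vect_step_op_shl:  [ X, X << S, X << 2*S, X << 3*S ]
     vect_step_op_neg:  [ X, -X, X, -X ]
     vect_step_op_mul:  [ X, X*S, X*S*S, X*S*S*S ] (in unsigned arithmetic).  */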
9208 static tree
9209 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9210 tree step_expr, poly_uint64 nunits,
9211 tree vectype,
9212 enum vect_induction_op_type induction_type)
9214 unsigned HOST_WIDE_INT const_nunits;
9215 tree vec_shift, vec_init, new_name;
9216 unsigned i;
9217 tree itype = TREE_TYPE (vectype);
9219 /* iv_loop is the loop to be vectorized. Create:
9220 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9221 new_name = gimple_convert (stmts, itype, init_expr);
9222 switch (induction_type)
9224 case vect_step_op_shr:
9225 case vect_step_op_shl:
9226 /* Build the initial value by shifting the splatted init by the series [0, S, 2*S, ...]. */
9227 vec_init = gimple_build_vector_from_val (stmts,
9228 vectype,
9229 new_name);
9230 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9231 build_zero_cst (itype), step_expr);
9232 vec_init = gimple_build (stmts,
9233 (induction_type == vect_step_op_shr
9234 ? RSHIFT_EXPR : LSHIFT_EXPR),
9235 vectype, vec_init, vec_shift);
9236 break;
9238 case vect_step_op_neg:
9240 vec_init = gimple_build_vector_from_val (stmts,
9241 vectype,
9242 new_name);
9243 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9244 vectype, vec_init);
9245 /* The encoding has 2 interleaved stepped patterns. */
9246 vec_perm_builder sel (nunits, 2, 3);
9247 sel.quick_grow (6);
9248 for (i = 0; i < 3; i++)
9250 sel[2 * i] = i;
9251 sel[2 * i + 1] = i + nunits;
9253 vec_perm_indices indices (sel, 2, nunits);
9254 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9255 fail when vec_init is a const vector. In that situation the vec_perm is
9256 not really needed. */
9257 tree perm_mask_even
9258 = vect_gen_perm_mask_any (vectype, indices);
9259 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9260 vectype,
9261 vec_init, vec_neg,
9262 perm_mask_even);
9264 break;
9266 case vect_step_op_mul:
9268 /* Use an unsigned multiplication to avoid UB from signed integer overflow. */
9269 gcc_assert (nunits.is_constant (&const_nunits));
9270 tree utype = unsigned_type_for (itype);
9271 tree uvectype = build_vector_type (utype,
9272 TYPE_VECTOR_SUBPARTS (vectype));
9273 new_name = gimple_convert (stmts, utype, new_name);
9274 vec_init = gimple_build_vector_from_val (stmts,
9275 uvectype,
9276 new_name);
9277 tree_vector_builder elts (uvectype, const_nunits, 1);
9278 tree elt_step = build_one_cst (utype);
9280 elts.quick_push (elt_step);
9281 for (i = 1; i < const_nunits; i++)
9283 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step, i). */
9284 elt_step = gimple_build (stmts, MULT_EXPR,
9285 utype, elt_step, step_expr);
9286 elts.quick_push (elt_step);
9288 /* Create the vector of step powers [1, step, pow (step, 2), ...,
9289 pow (step, nunits-1)]. */
9290 tree vec_mul = gimple_build_vector (stmts, &elts);
9291 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9292 vec_init, vec_mul);
9293 vec_init = gimple_convert (stmts, vectype, vec_init);
9295 break;
9297 default:
9298 gcc_unreachable ();
9301 return vec_init;
9304 /* Peel init_expr by skip_niter for induction_type. */
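/* For illustration (assuming the loop is peeled by skip_niters == K):
     vect_step_op_neg:      init' = (K & 1) ? -init : init
     vect_step_op_shr/shl:  init' = init >> (S * K) or init << (S * K),
                            with the overlarge-shift case special-cased below
     vect_step_op_mul:      init' = init * pow (S, K) mod 2^prec,
                            computed via mpz_powm.  */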
9305 tree
9306 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9307 tree skip_niters, tree step_expr,
9308 enum vect_induction_op_type induction_type)
9310 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9311 tree type = TREE_TYPE (init_expr);
9312 unsigned prec = TYPE_PRECISION (type);
9313 switch (induction_type)
9315 case vect_step_op_neg:
9316 if (TREE_INT_CST_LOW (skip_niters) % 2)
9317 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9318 /* else no change. */
9319 break;
9321 case vect_step_op_shr:
9322 case vect_step_op_shl:
9323 skip_niters = gimple_convert (stmts, type, skip_niters);
9324 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9325 /* When the shift amount is >= the precision we need to avoid UB.
9326 The original loop has no UB, and according to the semantics the peeled
9327 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9328 if (!tree_fits_uhwi_p (step_expr)
9329 || tree_to_uhwi (step_expr) >= prec)
9331 if (induction_type == vect_step_op_shl
9332 || TYPE_UNSIGNED (type))
9333 init_expr = build_zero_cst (type);
9334 else
9335 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9336 init_expr,
9337 wide_int_to_tree (type, prec - 1));
9339 else
9340 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9341 ? RSHIFT_EXPR : LSHIFT_EXPR),
9342 type, init_expr, step_expr);
9343 break;
9345 case vect_step_op_mul:
9347 tree utype = unsigned_type_for (type);
9348 init_expr = gimple_convert (stmts, utype, init_expr);
9349 wide_int skipn = wi::to_wide (skip_niters);
9350 wide_int begin = wi::to_wide (step_expr);
9351 auto_mpz base, exp, mod, res;
9352 wi::to_mpz (begin, base, TYPE_SIGN (type));
9353 wi::to_mpz (skipn, exp, UNSIGNED);
9354 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9355 mpz_powm (res, base, exp, mod);
9356 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9357 tree mult_expr = wide_int_to_tree (utype, begin);
9358 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9359 init_expr, mult_expr);
9360 init_expr = gimple_convert (stmts, type, init_expr);
9362 break;
9364 default:
9365 gcc_unreachable ();
9368 return init_expr;
9371 /* Create vector step for vectorized iv. */
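/* For illustration (scalar step S, vectorization factor VF):
     vect_step_op_mul:      pow (S, VF), e.g. S == 3, VF == 4 gives 81
     vect_step_op_shr/shl:  S * VF
     vect_step_op_neg:      no step is needed.  */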
9372 static tree
9373 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9374 poly_uint64 vf,
9375 enum vect_induction_op_type induction_type)
9377 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9378 tree new_name = NULL;
9379 /* Step should be pow (step, vf) for mult induction. */
9380 if (induction_type == vect_step_op_mul)
9382 gcc_assert (vf.is_constant ());
9383 wide_int begin = wi::to_wide (step_expr);
9385 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9386 begin = wi::mul (begin, wi::to_wide (step_expr));
9388 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9390 else if (induction_type == vect_step_op_neg)
9391 /* Do nothing. */
9393 else
9394 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9395 expr, step_expr);
9396 return new_name;
9399 static tree
9400 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9401 stmt_vec_info stmt_info,
9402 tree new_name, tree vectype,
9403 enum vect_induction_op_type induction_type)
9405 /* No step is needed for neg induction. */
9406 if (induction_type == vect_step_op_neg)
9407 return NULL;
9409 tree t = unshare_expr (new_name);
9410 gcc_assert (CONSTANT_CLASS_P (new_name)
9411 || TREE_CODE (new_name) == SSA_NAME);
9412 tree new_vec = build_vector_from_val (vectype, t);
9413 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9414 new_vec, vectype, NULL);
9415 return vec_step;
9418 /* Update the vectorized iv by applying vec_step to induc_def, the current iv value. */
9419 static tree
9420 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9421 tree induc_def, tree vec_step,
9422 enum vect_induction_op_type induction_type)
9424 tree vec_def = induc_def;
9425 switch (induction_type)
9427 case vect_step_op_mul:
9429 /* Use an unsigned multiplication to avoid UB from signed integer overflow. */
9430 tree uvectype
9431 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9432 TYPE_VECTOR_SUBPARTS (vectype));
9433 vec_def = gimple_convert (stmts, uvectype, vec_def);
9434 vec_step = gimple_convert (stmts, uvectype, vec_step);
9435 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9436 vec_def, vec_step);
9437 vec_def = gimple_convert (stmts, vectype, vec_def);
9439 break;
9441 case vect_step_op_shr:
9442 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9443 vec_def, vec_step);
9444 break;
9446 case vect_step_op_shl:
9447 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9448 vec_def, vec_step);
9449 break;
9450 case vect_step_op_neg:
9451 vec_def = induc_def;
9452 /* Do nothing. */
9453 break;
9454 default:
9455 gcc_unreachable ();
9458 return vec_def;
9462 /* Function vectorizable_nonlinear_induction
9464 Check if STMT_INFO performs a nonlinear induction computation that can be
9465 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9466 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9467 basic block.
9468 Return true if STMT_INFO is vectorizable in this way. */
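/* An illustrative example (an assumed source loop, for exposition only):

     unsigned int x = x0;
     for (i = 0; i < n; i++)
       {
         a[i] = x;
         x *= 3;    <-- vect_step_op_mul; likewise x >>= 1, x <<= 1, x = -x
       }

   With four lanes the IV is seeded with [ x0, x0*3, x0*9, x0*27 ] and
   each vector iteration multiplies it by the splat of 81 == pow (3, 4).  */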
9470 static bool
9471 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9472 stmt_vec_info stmt_info,
9473 gimple **vec_stmt, slp_tree slp_node,
9474 stmt_vector_for_cost *cost_vec)
9476 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9477 unsigned ncopies;
9478 bool nested_in_vect_loop = false;
9479 class loop *iv_loop;
9480 tree vec_def;
9481 edge pe = loop_preheader_edge (loop);
9482 basic_block new_bb;
9483 tree vec_init, vec_step;
9484 tree new_name;
9485 gimple *new_stmt;
9486 gphi *induction_phi;
9487 tree induc_def, vec_dest;
9488 tree init_expr, step_expr;
9489 tree niters_skip;
9490 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9491 unsigned i;
9492 gimple_stmt_iterator si;
9494 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9496 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9497 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9498 enum vect_induction_op_type induction_type
9499 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9501 gcc_assert (induction_type > vect_step_op_add);
9503 if (slp_node)
9504 ncopies = 1;
9505 else
9506 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9507 gcc_assert (ncopies >= 1);
9509 /* FORNOW. Only handle nonlinear induction in the same loop. */
9510 if (nested_in_vect_loop_p (loop, stmt_info))
9512 if (dump_enabled_p ())
9513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9514 "nonlinear induction in nested loop.\n");
9515 return false;
9518 iv_loop = loop;
9519 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9521 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
9522 update for each iv and a permutation to generate the wanted vector iv. */
9523 if (slp_node)
9525 if (dump_enabled_p ())
9526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9527 "SLP induction not supported for nonlinear"
9528 " induction.\n");
9529 return false;
9532 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9534 if (dump_enabled_p ())
9535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9536 "floating point nonlinear induction vectorization"
9537 " not supported.\n");
9538 return false;
9541 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9542 init_expr = vect_phi_initial_value (phi);
9543 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9544 && TREE_CODE (step_expr) == INTEGER_CST);
9545 /* step_expr should have the same type as init_expr, e.g. for uint64 a >> 1
9546 the step is int, but the vector<uint64> shift is used. */
9547 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9549 if (TREE_CODE (init_expr) == INTEGER_CST)
9550 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9551 else
9552 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9553 TREE_TYPE (init_expr)));
9555 switch (induction_type)
9557 case vect_step_op_neg:
9558 if (TREE_CODE (init_expr) != INTEGER_CST
9559 && TREE_CODE (init_expr) != REAL_CST)
9561 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9562 if (!directly_supported_p (NEGATE_EXPR, vectype))
9563 return false;
9565 /* The encoding has 2 interleaved stepped patterns. */
9566 vec_perm_builder sel (nunits, 2, 3);
9567 machine_mode mode = TYPE_MODE (vectype);
9568 sel.quick_grow (6);
9569 for (i = 0; i < 3; i++)
9571 sel[i * 2] = i;
9572 sel[i * 2 + 1] = i + nunits;
9574 vec_perm_indices indices (sel, 2, nunits);
9575 if (!can_vec_perm_const_p (mode, mode, indices))
9576 return false;
9578 break;
9580 case vect_step_op_mul:
9582 /* Check for backend support of MULT_EXPR. */
9583 if (!directly_supported_p (MULT_EXPR, vectype))
9584 return false;
9586 /* ?? How to construct the vector step for variable-length vectors:
9587 [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9588 if (!vf.is_constant ())
9589 return false;
9591 break;
9593 case vect_step_op_shr:
9594 /* Check for backend support of RSHIFT_EXPR. */
9595 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9596 return false;
9598 /* Don't shift more than the type precision, to avoid UB. */
9599 if (!tree_fits_uhwi_p (step_expr)
9600 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9601 TYPE_PRECISION (TREE_TYPE (init_expr))))
9602 return false;
9603 break;
9605 case vect_step_op_shl:
9606 /* Check for backend support of LSHIFT_EXPR. */
9607 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9608 return false;
9610 /* Don't shift more than the type precision, to avoid UB. */
9611 if (!tree_fits_uhwi_p (step_expr)
9612 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9613 TYPE_PRECISION (TREE_TYPE (init_expr))))
9614 return false;
9616 break;
9618 default:
9619 gcc_unreachable ();
9622 if (!vec_stmt) /* transformation not required. */
9624 unsigned inside_cost = 0, prologue_cost = 0;
9625 /* loop cost for vec_loop. */
9627 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9628 stmt_info, 0, vect_body);
9630 /* Neg induction doesn't have any inside_cost. */
9632 if (induction_type == vect_step_op_neg)
9633 inside_cost = 0;
9635 /* prologue cost for vec_init and vec_step. */
9636 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9637 stmt_info, 0, vect_prologue);
9639 if (dump_enabled_p ())
9640 dump_printf_loc (MSG_NOTE, vect_location,
9641 "vect_model_induction_cost: inside_cost = %d, "
9642 "prologue_cost = %d. \n", inside_cost,
9643 prologue_cost);
9645 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9646 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9647 return true;
9650 /* Transform. */
9652 /* Compute a vector variable, initialized with the first VF values of
9653 the induction variable. E.g., for an iv with IV_PHI='X' and
9654 evolution S, for a vector of 4 units, we want to compute:
9655 [X, X + S, X + 2*S, X + 3*S]. */
9657 if (dump_enabled_p ())
9658 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9660 pe = loop_preheader_edge (iv_loop);
9661 /* Find the first insertion point in the BB. */
9662 basic_block bb = gimple_bb (phi);
9663 si = gsi_after_labels (bb);
9665 gimple_seq stmts = NULL;
9667 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9668 /* If we are using the loop mask to "peel" for alignment then we need
9669 to adjust the start value here. */
9670 if (niters_skip != NULL_TREE)
9671 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9672 step_expr, induction_type);
9674 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9675 step_expr, nunits, vectype,
9676 induction_type);
9677 if (stmts)
9679 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9680 gcc_assert (!new_bb);
9683 stmts = NULL;
9684 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9685 vf, induction_type);
9686 if (stmts)
9688 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9689 gcc_assert (!new_bb);
9692 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9693 new_name, vectype,
9694 induction_type);
9695 /* Create the following def-use cycle:
9696 loop prolog:
9697 vec_init = ...
9698 vec_step = ...
9699 loop:
9700 vec_iv = PHI <vec_init, vec_loop>
9702 STMT
9704 vec_loop = vec_iv + vec_step; */
9706 /* Create the induction-phi that defines the induction-operand. */
9707 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9708 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9709 induc_def = PHI_RESULT (induction_phi);
9711 /* Create the iv update inside the loop. */
9712 stmts = NULL;
9713 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9714 induc_def, vec_step,
9715 induction_type);
9717 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9718 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9720 /* Set the arguments of the phi node: */
9721 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9722 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9723 UNKNOWN_LOCATION);
9725 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9726 *vec_stmt = induction_phi;
9728 /* In case the vectorization factor (VF) is bigger than the number
9729 of elements that we can fit in a vectype (nunits), we have to generate
9730 more than one vector stmt - i.e. - we need to "unroll" the
9731 vector stmt by a factor VF/nunits. For more details see documentation
9732 in vectorizable_operation. */
9734 if (ncopies > 1)
9736 stmts = NULL;
9737 /* FORNOW. This restriction should be relaxed. */
9738 gcc_assert (!nested_in_vect_loop);
9740 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9741 nunits, induction_type);
9743 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9744 new_name, vectype,
9745 induction_type);
9746 vec_def = induc_def;
9747 for (i = 1; i < ncopies; i++)
9749 /* vec_i = vec_prev + vec_step. */
9750 stmts = NULL;
9751 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9752 vec_def, vec_step,
9753 induction_type);
9754 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9755 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9756 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9760 if (dump_enabled_p ())
9761 dump_printf_loc (MSG_NOTE, vect_location,
9762 "transform induction: created def-use cycle: %G%G",
9763 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9765 return true;
9768 /* Function vectorizable_induction
9770 Check if STMT_INFO performs an induction computation that can be vectorized.
9771 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9772 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9773 Return true if STMT_INFO is vectorizable in this way. */
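/* For illustration (a linear IV with start X, step S and four lanes;
   a sketch of the generated form only):

     vect_init = [ X, X + S, X + 2*S, X + 3*S ]
     loop:
       vec_iv   = PHI <vect_init (preheader), vec_next (latch)>
       ...
       vec_next = vec_iv + [ 4*S, 4*S, 4*S, 4*S ]  */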
9775 bool
9776 vectorizable_induction (loop_vec_info loop_vinfo,
9777 stmt_vec_info stmt_info,
9778 gimple **vec_stmt, slp_tree slp_node,
9779 stmt_vector_for_cost *cost_vec)
9781 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9782 unsigned ncopies;
9783 bool nested_in_vect_loop = false;
9784 class loop *iv_loop;
9785 tree vec_def;
9786 edge pe = loop_preheader_edge (loop);
9787 basic_block new_bb;
9788 tree new_vec, vec_init, vec_step, t;
9789 tree new_name;
9790 gimple *new_stmt;
9791 gphi *induction_phi;
9792 tree induc_def, vec_dest;
9793 tree init_expr, step_expr;
9794 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9795 unsigned i;
9796 tree expr;
9797 gimple_stmt_iterator si;
9798 enum vect_induction_op_type induction_type
9799 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9801 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9802 if (!phi)
9803 return false;
9805 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9806 return false;
9808 /* Make sure it was recognized as an induction computation. */
9809 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9810 return false;
9812 /* Handle nonlinear induction in a separate place. */
9813 if (induction_type != vect_step_op_add)
9814 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9815 vec_stmt, slp_node, cost_vec);
9817 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9818 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9820 if (slp_node)
9821 ncopies = 1;
9822 else
9823 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9824 gcc_assert (ncopies >= 1);
9826 /* FORNOW. These restrictions should be relaxed. */
9827 if (nested_in_vect_loop_p (loop, stmt_info))
9829 imm_use_iterator imm_iter;
9830 use_operand_p use_p;
9831 gimple *exit_phi;
9832 edge latch_e;
9833 tree loop_arg;
9835 if (ncopies > 1)
9837 if (dump_enabled_p ())
9838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9839 "multiple types in nested loop.\n");
9840 return false;
9843 exit_phi = NULL;
9844 latch_e = loop_latch_edge (loop->inner);
9845 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9846 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9848 gimple *use_stmt = USE_STMT (use_p);
9849 if (is_gimple_debug (use_stmt))
9850 continue;
9852 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9854 exit_phi = use_stmt;
9855 break;
9858 if (exit_phi)
9860 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9861 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9862 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9864 if (dump_enabled_p ())
9865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9866 "inner-loop induction only used outside "
9867 "of the outer vectorized loop.\n");
9868 return false;
9872 nested_in_vect_loop = true;
9873 iv_loop = loop->inner;
9875 else
9876 iv_loop = loop;
9877 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9879 if (slp_node && !nunits.is_constant ())
9881 /* The current SLP code creates the step value element-by-element. */
9882 if (dump_enabled_p ())
9883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9884 "SLP induction not supported for variable-length"
9885 " vectors.\n");
9886 return false;
9889 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9891 if (dump_enabled_p ())
9892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9893 "floating point induction vectorization disabled\n");
9894 return false;
9897 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9898 gcc_assert (step_expr != NULL_TREE);
9899 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9901 /* Check for backend support of PLUS/MINUS_EXPR. */
9902 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9903 || !directly_supported_p (MINUS_EXPR, step_vectype))
9904 return false;
9906 if (!vec_stmt) /* transformation not required. */
9908 unsigned inside_cost = 0, prologue_cost = 0;
9909 if (slp_node)
9911 /* We eventually need to set a vector type on invariant
9912 arguments. */
9913 unsigned j;
9914 slp_tree child;
9915 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9916 if (!vect_maybe_update_slp_op_vectype
9917 (child, SLP_TREE_VECTYPE (slp_node)))
9919 if (dump_enabled_p ())
9920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9921 "incompatible vector types for "
9922 "invariants\n");
9923 return false;
9925 /* loop cost for vec_loop. */
9926 inside_cost
9927 = record_stmt_cost (cost_vec,
9928 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9929 vector_stmt, stmt_info, 0, vect_body);
9930 /* prologue cost for vec_init (if not nested) and step. */
9931 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9932 scalar_to_vec,
9933 stmt_info, 0, vect_prologue);
9935 else /* if (!slp_node) */
9937 /* loop cost for vec_loop. */
9938 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9939 stmt_info, 0, vect_body);
9940 /* prologue cost for vec_init and vec_step. */
9941 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9942 stmt_info, 0, vect_prologue);
9944 if (dump_enabled_p ())
9945 dump_printf_loc (MSG_NOTE, vect_location,
9946 "vect_model_induction_cost: inside_cost = %d, "
9947 "prologue_cost = %d .\n", inside_cost,
9948 prologue_cost);
9950 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9951 DUMP_VECT_SCOPE ("vectorizable_induction");
9952 return true;
9955 /* Transform. */
9957 /* Compute a vector variable, initialized with the first VF values of
9958 the induction variable. E.g., for an iv with IV_PHI='X' and
9959 evolution S, for a vector of 4 units, we want to compute:
9960 [X, X + S, X + 2*S, X + 3*S]. */
9962 if (dump_enabled_p ())
9963 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9965 pe = loop_preheader_edge (iv_loop);
9966 /* Find the first insertion point in the BB. */
9967 basic_block bb = gimple_bb (phi);
9968 si = gsi_after_labels (bb);
9970 /* For SLP induction we have to generate several IVs as for example
9971 with group size 3 we need
9972 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9973 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9974 if (slp_node)
9976 /* Enforced above. */
9977 unsigned int const_nunits = nunits.to_constant ();
9979 /* The initial values are vectorized, but any lanes > group_size
9980 need adjustment. */
9981 slp_tree init_node
9982 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9984 /* Gather steps. Since we do not vectorize inductions as
9985 cycles we have to reconstruct the step from SCEV data. */
9986 unsigned group_size = SLP_TREE_LANES (slp_node);
9987 tree *steps = XALLOCAVEC (tree, group_size);
9988 tree *inits = XALLOCAVEC (tree, group_size);
9989 stmt_vec_info phi_info;
9990 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9992 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9993 if (!init_node)
9994 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9995 pe->dest_idx);
9998 /* Now generate the IVs. */
9999 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10000 gcc_assert ((const_nunits * nvects) % group_size == 0);
10001 unsigned nivs;
10002 if (nested_in_vect_loop)
10003 nivs = nvects;
10004 else
10006 /* Compute the number of distinct IVs we need. First reduce
10007 group_size if it is a multiple of const_nunits so we get
10008 one IV for a group_size of 4 but const_nunits 2. */
10009 unsigned group_sizep = group_size;
10010 if (group_sizep % const_nunits == 0)
10011 group_sizep = group_sizep / const_nunits;
10012 nivs = least_common_multiple (group_sizep,
10013 const_nunits) / const_nunits;
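/* For illustration: group_size == 3 with const_nunits == 4 gives
   nivs = lcm (3, 4) / 4 == 3 distinct vector IVs, matching the three
   vectors in the group-size-3 example above.  */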
10015 tree stept = TREE_TYPE (step_vectype);
10016 tree lupdate_mul = NULL_TREE;
10017 if (!nested_in_vect_loop)
10019 /* The number of iterations covered in one vector iteration. */
10020 unsigned lup_mul = (nvects * const_nunits) / group_size;
10021 lupdate_mul
10022 = build_vector_from_val (step_vectype,
10023 SCALAR_FLOAT_TYPE_P (stept)
10024 ? build_real_from_wide (stept, lup_mul,
10025 UNSIGNED)
10026 : build_int_cstu (stept, lup_mul));
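/* For instance, with nvects 3, const_nunits 4 and group_size 3 each vector
   iteration covers (3 * 4) / 3 = 4 scalar iterations, so every lane of each
   IV advances by four times its scalar step per vector iteration.  */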
10028 tree peel_mul = NULL_TREE;
10029 gimple_seq init_stmts = NULL;
10030 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10032 if (SCALAR_FLOAT_TYPE_P (stept))
10033 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10034 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10035 else
10036 peel_mul = gimple_convert (&init_stmts, stept,
10037 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10038 peel_mul = gimple_build_vector_from_val (&init_stmts,
10039 step_vectype, peel_mul);
10041 unsigned ivn;
10042 auto_vec<tree> vec_steps;
10043 for (ivn = 0; ivn < nivs; ++ivn)
10045 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10046 tree_vector_builder init_elts (vectype, const_nunits, 1);
10047 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10048 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10050 /* The scalar steps of the IVs. */
10051 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10052 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10053 step_elts.quick_push (elt);
10054 if (!init_node)
10056 /* The scalar inits of the IVs if not vectorized. */
10057 elt = inits[(ivn*const_nunits + eltn) % group_size];
10058 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10059 TREE_TYPE (elt)))
10060 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10061 TREE_TYPE (vectype), elt);
10062 init_elts.quick_push (elt);
10064 /* The number of steps to add to the initial values. */
10065 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10066 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10067 ? build_real_from_wide (stept,
10068 mul_elt, UNSIGNED)
10069 : build_int_cstu (stept, mul_elt));
10071 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10072 vec_steps.safe_push (vec_step);
10073 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10074 if (peel_mul)
10075 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10076 step_mul, peel_mul);
10077 if (!init_node)
10078 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10080 /* Create the induction-phi that defines the induction-operand. */
10081 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10082 "vec_iv_");
10083 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10084 induc_def = PHI_RESULT (induction_phi);
10086 /* Create the iv update inside the loop.  */
10087 tree up = vec_step;
10088 if (lupdate_mul)
10089 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10090 vec_step, lupdate_mul);
10091 gimple_seq stmts = NULL;
10092 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10093 vec_def = gimple_build (&stmts,
10094 PLUS_EXPR, step_vectype, vec_def, up);
10095 vec_def = gimple_convert (&stmts, vectype, vec_def);
10096 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10097 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10098 UNKNOWN_LOCATION);
10100 if (init_node)
10101 vec_init = vect_get_slp_vect_def (init_node, ivn);
10102 if (!nested_in_vect_loop
10103 && !integer_zerop (step_mul))
10105 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10106 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10107 vec_step, step_mul);
10108 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10109 vec_def, up);
10110 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10113 /* Set the arguments of the phi node: */
10114 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10116 slp_node->push_vec_def (induction_phi);
10118 if (!nested_in_vect_loop)
10120 /* Fill up to the number of vectors we need for the whole group. */
10121 nivs = least_common_multiple (group_size,
10122 const_nunits) / const_nunits;
10123 vec_steps.reserve (nivs-ivn);
10124 for (; ivn < nivs; ++ivn)
10126 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10127 vec_steps.quick_push (vec_steps[0]);
10131 /* Re-use IVs when we can. We are generating further vector
10132 stmts by adding VF' * stride to the IVs generated above. */
10133 if (ivn < nvects)
10135 unsigned vfp
10136 = least_common_multiple (group_size, const_nunits) / group_size;
10137 tree lupdate_mul
10138 = build_vector_from_val (step_vectype,
10139 SCALAR_FLOAT_TYPE_P (stept)
10140 ? build_real_from_wide (stept,
10141 vfp, UNSIGNED)
10142 : build_int_cstu (stept, vfp));
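/* Continuing the group_size 3, const_nunits 4 example, vfp = lcm (3, 4) / 3
   = 4, so each reused vector is the one generated nivs positions earlier
   with four times the per-lane scalar step added to it.  */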
10143 for (; ivn < nvects; ++ivn)
10145 gimple *iv
10146 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10147 tree def = gimple_get_lhs (iv);
10148 if (ivn < 2*nivs)
10149 vec_steps[ivn - nivs]
10150 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10151 vec_steps[ivn - nivs], lupdate_mul);
10152 gimple_seq stmts = NULL;
10153 def = gimple_convert (&stmts, step_vectype, def);
10154 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10155 def, vec_steps[ivn % nivs]);
10156 def = gimple_convert (&stmts, vectype, def);
10157 if (gimple_code (iv) == GIMPLE_PHI)
10158 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10159 else
10161 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10162 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10164 slp_node->push_vec_def (def);
10168 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10169 gcc_assert (!new_bb);
10171 return true;
10174 init_expr = vect_phi_initial_value (phi);
10176 gimple_seq stmts = NULL;
10177 if (!nested_in_vect_loop)
10179 /* Convert the initial value to the IV update type. */
10180 tree new_type = TREE_TYPE (step_expr);
10181 init_expr = gimple_convert (&stmts, new_type, init_expr);
10183 /* If we are using the loop mask to "peel" for alignment then we need
10184 to adjust the start value here. */
10185 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10186 if (skip_niters != NULL_TREE)
10188 if (FLOAT_TYPE_P (vectype))
10189 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10190 skip_niters);
10191 else
10192 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10193 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10194 skip_niters, step_expr);
10195 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10196 init_expr, skip_step);
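/* The loop mask skips the first SKIP_NITERS iterations, so start the IV
   SKIP_NITERS steps early and the first unmasked lane still sees the
   original initial value; e.g. with SKIP_NITERS 2 and step S the adjusted
   start is INIT - 2 * S and lane 2 of the first vector is again INIT.  */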
10200 if (stmts)
10202 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10203 gcc_assert (!new_bb);
10206 /* Create the vector that holds the initial_value of the induction. */
10207 if (nested_in_vect_loop)
10209 /* iv_loop is nested in the loop to be vectorized. init_expr has already
10210 been created during vectorization of previous stmts. We obtain it
10211 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10212 auto_vec<tree> vec_inits;
10213 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10214 init_expr, &vec_inits);
10215 vec_init = vec_inits[0];
10216 /* If the initial value is not of proper type, convert it. */
10217 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10219 new_stmt
10220 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10221 vect_simple_var,
10222 "vec_iv_"),
10223 VIEW_CONVERT_EXPR,
10224 build1 (VIEW_CONVERT_EXPR, vectype,
10225 vec_init));
10226 vec_init = gimple_assign_lhs (new_stmt);
10227 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10228 new_stmt);
10229 gcc_assert (!new_bb);
10232 else
10234 /* iv_loop is the loop to be vectorized. Create:
10235 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10236 stmts = NULL;
10237 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10239 unsigned HOST_WIDE_INT const_nunits;
10240 if (nunits.is_constant (&const_nunits))
10242 tree_vector_builder elts (step_vectype, const_nunits, 1);
10243 elts.quick_push (new_name);
10244 for (i = 1; i < const_nunits; i++)
10246 /* Create: new_name_i = new_name + step_expr */
10247 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10248 new_name, step_expr);
10249 elts.quick_push (new_name);
10251 /* Create a vector from [new_name_0, new_name_1, ...,
10252 new_name_nunits-1] */
10253 vec_init = gimple_build_vector (&stmts, &elts);
10255 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10256 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10257 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10258 new_name, step_expr);
10259 else
10261 /* Build:
10262 [base, base, base, ...]
10263 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10264 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10265 gcc_assert (flag_associative_math);
10266 tree index = build_index_vector (step_vectype, 0, 1);
10267 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10268 new_name);
10269 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10270 step_expr);
10271 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10272 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10273 vec_init, step_vec);
10274 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10275 vec_init, base_vec);
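/* For instance, with base 1.0, step 0.5 and four lanes this yields
   [1.0, 1.5, 2.0, 2.5].  */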
10277 vec_init = gimple_convert (&stmts, vectype, vec_init);
10279 if (stmts)
10281 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10282 gcc_assert (!new_bb);
10287 /* Create the vector that holds the step of the induction. */
10288 if (nested_in_vect_loop)
10289 /* iv_loop is nested in the loop to be vectorized. Generate:
10290 vec_step = [S, S, S, S] */
10291 new_name = step_expr;
10292 else
10294 /* iv_loop is the loop to be vectorized. Generate:
10295 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10296 gimple_seq seq = NULL;
10297 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10299 expr = build_int_cst (integer_type_node, vf);
10300 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10302 else
10303 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10304 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10305 expr, step_expr);
10306 if (seq)
10308 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10309 gcc_assert (!new_bb);
10313 t = unshare_expr (new_name);
10314 gcc_assert (CONSTANT_CLASS_P (new_name)
10315 || TREE_CODE (new_name) == SSA_NAME);
10316 new_vec = build_vector_from_val (step_vectype, t);
10317 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10318 new_vec, step_vectype, NULL);
10321 /* Create the following def-use cycle:
10322 loop prolog:
10323 vec_init = ...
10324 vec_step = ...
10325 loop:
10326 vec_iv = PHI <vec_init, vec_loop>
10328 STMT
10330 vec_loop = vec_iv + vec_step; */
10332 /* Create the induction-phi that defines the induction-operand. */
10333 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10334 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10335 induc_def = PHI_RESULT (induction_phi);
10337 /* Create the iv update inside the loop.  */
10338 stmts = NULL;
10339 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10340 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10341 vec_def = gimple_convert (&stmts, vectype, vec_def);
10342 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10343 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10345 /* Set the arguments of the phi node: */
10346 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10347 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10348 UNKNOWN_LOCATION);
10350 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10351 *vec_stmt = induction_phi;
10353 /* In case the vectorization factor (VF) is bigger than the number
10354 of elements that we can fit in a vectype (nunits), we have to generate
10355 more than one vector stmt, i.e., we need to "unroll" the
10356 vector stmt by a factor of VF/nunits. For more details see the
10357 documentation in vectorizable_operation. */
10359 if (ncopies > 1)
10361 gimple_seq seq = NULL;
10362 /* FORNOW. This restriction should be relaxed. */
10363 gcc_assert (!nested_in_vect_loop);
10365 /* Create the vector that holds the step of the induction. */
10366 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10368 expr = build_int_cst (integer_type_node, nunits);
10369 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10371 else
10372 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10373 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10374 expr, step_expr);
10375 if (seq)
10377 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10378 gcc_assert (!new_bb);
10381 t = unshare_expr (new_name);
10382 gcc_assert (CONSTANT_CLASS_P (new_name)
10383 || TREE_CODE (new_name) == SSA_NAME);
10384 new_vec = build_vector_from_val (step_vectype, t);
10385 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10386 new_vec, step_vectype, NULL);
10388 vec_def = induc_def;
10389 for (i = 1; i < ncopies + 1; i++)
10391 /* vec_i = vec_prev + vec_step */
10392 gimple_seq stmts = NULL;
10393 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10394 vec_def = gimple_build (&stmts,
10395 PLUS_EXPR, step_vectype, vec_def, vec_step);
10396 vec_def = gimple_convert (&stmts, vectype, vec_def);
10398 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10399 if (i < ncopies)
10401 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10402 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10404 else
10406 /* vec_1 = vec_iv + (VF/n * S)
10407 vec_2 = vec_1 + (VF/n * S)
10409 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10411 vec_n is used as vec_loop to save the large step register and
10412 related operations. */
10413 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10414 UNKNOWN_LOCATION);
10419 if (dump_enabled_p ())
10420 dump_printf_loc (MSG_NOTE, vect_location,
10421 "transform induction: created def-use cycle: %G%G",
10422 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10424 return true;
10427 /* Function vectorizable_live_operation.
10429 STMT_INFO computes a value that is used outside the loop. Check if
10430 it can be supported. */
10432 bool
10433 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10434 slp_tree slp_node, slp_instance slp_node_instance,
10435 int slp_index, bool vec_stmt_p,
10436 stmt_vector_for_cost *cost_vec)
10438 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10439 imm_use_iterator imm_iter;
10440 tree lhs, lhs_type, bitsize;
10441 tree vectype = (slp_node
10442 ? SLP_TREE_VECTYPE (slp_node)
10443 : STMT_VINFO_VECTYPE (stmt_info));
10444 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10445 int ncopies;
10446 gimple *use_stmt;
10447 auto_vec<tree> vec_oprnds;
10448 int vec_entry = 0;
10449 poly_uint64 vec_index = 0;
10451 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10453 /* If a stmt of a reduction is live, vectorize it via
10454 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10455 validity so just trigger the transform here. */
10456 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10458 if (!vec_stmt_p)
10459 return true;
10460 if (slp_node)
10462 /* For reduction chains the meta-info is attached to
10463 the group leader. */
10464 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10465 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10466 /* For SLP reductions we vectorize the epilogue for
10467 all involved stmts together. */
10468 else if (slp_index != 0)
10469 return true;
10471 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10472 gcc_assert (reduc_info->is_reduc_info);
10473 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10474 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10475 return true;
10476 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10477 slp_node_instance);
10478 return true;
10481 /* If STMT is not relevant and it is a simple assignment and its inputs are
10482 invariant then it can remain in place, unvectorized. The original last
10483 scalar value that it computes will be used. */
10484 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10486 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10487 if (dump_enabled_p ())
10488 dump_printf_loc (MSG_NOTE, vect_location,
10489 "statement is simple and uses invariant. Leaving in "
10490 "place.\n");
10491 return true;
10494 if (slp_node)
10495 ncopies = 1;
10496 else
10497 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10499 if (slp_node)
10501 gcc_assert (slp_index >= 0);
10503 /* Get the last occurrence of the scalar index from the concatenation of
10504 all the slp vectors. Calculate which slp vector it is and the index
10505 within. */
10506 int num_scalar = SLP_TREE_LANES (slp_node);
10507 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10508 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
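/* For instance, with 2 SLP lanes, a single vector statement and nunits 4,
   pos = 1 * 4 - 2 + slp_index, so the last occurrences of lanes 0 and 1
   sit in elements 2 and 3 of vector 0.  */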
10510 /* Calculate which vector contains the result, and which lane of
10511 that vector we need. */
10512 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10514 if (dump_enabled_p ())
10515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10516 "Cannot determine which vector holds the"
10517 " final result.\n");
10518 return false;
10522 if (!vec_stmt_p)
10524 /* No transformation required. */
10525 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10527 if (slp_node)
10529 if (dump_enabled_p ())
10530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10531 "can't operate on partial vectors "
10532 "because an SLP statement is live after "
10533 "the loop.\n");
10534 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10536 else if (ncopies > 1)
10538 if (dump_enabled_p ())
10539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10540 "can't operate on partial vectors "
10541 "because ncopies is greater than 1.\n");
10542 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10544 else
10546 gcc_assert (ncopies == 1 && !slp_node);
10547 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10548 OPTIMIZE_FOR_SPEED))
10549 vect_record_loop_mask (loop_vinfo,
10550 &LOOP_VINFO_MASKS (loop_vinfo),
10551 1, vectype, NULL);
10552 else if (can_vec_extract_var_idx_p (
10553 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10554 vect_record_loop_len (loop_vinfo,
10555 &LOOP_VINFO_LENS (loop_vinfo),
10556 1, vectype, 1);
10557 else
10559 if (dump_enabled_p ())
10560 dump_printf_loc (
10561 MSG_MISSED_OPTIMIZATION, vect_location,
10562 "can't operate on partial vectors "
10563 "because the target doesn't support extract "
10564 "last reduction.\n");
10565 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10569 /* ??? Enable for loop costing as well. */
10570 if (!loop_vinfo)
10571 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10572 0, vect_epilogue);
10573 return true;
10576 /* Use the lhs of the original scalar statement. */
10577 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10578 if (dump_enabled_p ())
10579 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10580 "stmt %G", stmt);
10582 lhs = gimple_get_lhs (stmt);
10583 lhs_type = TREE_TYPE (lhs);
10585 bitsize = vector_element_bits_tree (vectype);
10587 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10588 tree vec_lhs, bitstart;
10589 gimple *vec_stmt;
10590 if (slp_node)
10592 gcc_assert (!loop_vinfo
10593 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10594 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10596 /* Get the correct slp vectorized stmt. */
10597 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10598 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10600 /* Get entry to use. */
10601 bitstart = bitsize_int (vec_index);
10602 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10604 else
10606 /* For multiple copies, get the last copy. */
10607 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10608 vec_lhs = gimple_get_lhs (vec_stmt);
10610 /* Get the last lane in the vector. */
10611 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10614 if (loop_vinfo)
10616 /* To ensure that VEC_LHS used by the lane extraction stmts satisfies the
10617 loop-closed PHI requirement, insert one PHI node for it. It looks like:
10618 loop;
10620 # lhs' = PHI <lhs>
10622 loop;
10624 # vec_lhs' = PHI <vec_lhs>
10625 new_tree = lane_extract <vec_lhs', ...>;
10626 lhs' = new_tree; */
10628 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10629 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10630 gcc_assert (single_pred_p (exit_bb));
10632 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10633 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10634 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10636 gimple_seq stmts = NULL;
10637 tree new_tree;
10638 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10640 /* Emit:
10642 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10644 where VEC_LHS is the vectorized live-out result and LEN is
10645 the loop length for the final iteration. */
10646 gcc_assert (ncopies == 1 && !slp_node);
10647 gimple_seq tem = NULL;
10648 gimple_stmt_iterator gsi = gsi_last (tem);
10649 tree len
10650 = vect_get_loop_len (loop_vinfo, &gsi,
10651 &LOOP_VINFO_LENS (loop_vinfo),
10652 1, vectype, 0, 0);
10654 /* BIAS - 1. */
10655 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10656 tree bias_minus_one
10657 = int_const_binop (MINUS_EXPR,
10658 build_int_cst (TREE_TYPE (len), biasval),
10659 build_one_cst (TREE_TYPE (len)));
10661 /* LAST_INDEX = LEN + (BIAS - 1). */
10662 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10663 len, bias_minus_one);
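/* For instance, with a zero bias and a final LEN of 5, LAST_INDEX is
   5 + (0 - 1) = 4, i.e. the last of the five active lanes.  */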
10665 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10666 tree scalar_res
10667 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10668 vec_lhs_phi, last_index);
10670 /* Convert the extracted vector element to the scalar type. */
10671 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10673 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10675 /* Emit:
10677 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10679 where VEC_LHS is the vectorized live-out result and MASK is
10680 the loop mask for the final iteration. */
10681 gcc_assert (ncopies == 1 && !slp_node);
10682 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10683 gimple_seq tem = NULL;
10684 gimple_stmt_iterator gsi = gsi_last (tem);
10685 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10686 &LOOP_VINFO_MASKS (loop_vinfo),
10687 1, vectype, 0);
10688 gimple_seq_add_seq (&stmts, tem);
10689 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10690 mask, vec_lhs_phi);
10692 /* Convert the extracted vector element to the scalar type. */
10693 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10695 else
10697 tree bftype = TREE_TYPE (vectype);
10698 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10699 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10700 new_tree = build3 (BIT_FIELD_REF, bftype,
10701 vec_lhs_phi, bitsize, bitstart);
10702 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10703 &stmts, true, NULL_TREE);
10706 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10707 if (stmts)
10708 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10710 /* Remove existing phis that copy from lhs and create copies
10711 from new_tree. */
10712 gimple_stmt_iterator gsi;
10713 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
10715 gimple *phi = gsi_stmt (gsi);
10716 if (gimple_phi_arg_def (phi, 0) == lhs)
10718 remove_phi_node (&gsi, false);
10719 tree lhs_phi = gimple_phi_result (phi);
10720 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10721 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10723 else
10724 gsi_next (&gsi);
10727 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10728 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10729 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10731 else
10733 /* For basic-block vectorization simply insert the lane-extraction. */
10734 tree bftype = TREE_TYPE (vectype);
10735 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10736 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10737 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10738 vec_lhs, bitsize, bitstart);
10739 gimple_seq stmts = NULL;
10740 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10741 &stmts, true, NULL_TREE);
10742 if (TREE_CODE (new_tree) == SSA_NAME
10743 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10744 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10745 if (is_a <gphi *> (vec_stmt))
10747 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10748 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10750 else
10752 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10753 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10756 /* Replace uses of lhs with the newly computed result. If the use stmt is a
10757 single-arg PHI, just replace all uses of the PHI result. This is necessary
10758 because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
10759 use_operand_p use_p;
10760 stmt_vec_info use_stmt_info;
10761 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10762 if (!is_gimple_debug (use_stmt)
10763 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10764 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10766 /* ??? This can happen when the live lane ends up being
10767 rooted in a vector construction code-generated by an
10768 external SLP node (and code-generation for that already
10769 happened). See gcc.dg/vect/bb-slp-47.c.
10770 Doing this is what would happen if that vector CTOR
10771 were not code-generated yet so it is not too bad.
10772 ??? In fact we'd likely want to avoid this situation
10773 in the first place. */
10774 if (TREE_CODE (new_tree) == SSA_NAME
10775 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10776 && gimple_code (use_stmt) != GIMPLE_PHI
10777 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10778 use_stmt))
10780 if (dump_enabled_p ())
10781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10782 "Using original scalar computation for "
10783 "live lane because use preceeds vector "
10784 "def\n");
10785 continue;
10787 /* ??? It can also happen that we end up pulling a def into
10788 a loop where replacing out-of-loop uses would require
10789 a new LC SSA PHI node. Retain the original scalar in
10790 those cases as well. PR98064. */
10791 if (TREE_CODE (new_tree) == SSA_NAME
10792 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10793 && (gimple_bb (use_stmt)->loop_father
10794 != gimple_bb (vec_stmt)->loop_father)
10795 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10796 gimple_bb (use_stmt)->loop_father))
10798 if (dump_enabled_p ())
10799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10800 "Using original scalar computation for "
10801 "live lane because there is an out-of-loop "
10802 "definition for it\n");
10803 continue;
10805 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10806 SET_USE (use_p, new_tree);
10807 update_stmt (use_stmt);
10811 return true;
10814 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10816 static void
10817 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10819 ssa_op_iter op_iter;
10820 imm_use_iterator imm_iter;
10821 def_operand_p def_p;
10822 gimple *ustmt;
10824 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10826 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10828 basic_block bb;
10830 if (!is_gimple_debug (ustmt))
10831 continue;
10833 bb = gimple_bb (ustmt);
10835 if (!flow_bb_inside_loop_p (loop, bb))
10837 if (gimple_debug_bind_p (ustmt))
10839 if (dump_enabled_p ())
10840 dump_printf_loc (MSG_NOTE, vect_location,
10841 "killing debug use\n");
10843 gimple_debug_bind_reset_value (ustmt);
10844 update_stmt (ustmt);
10846 else
10847 gcc_unreachable ();
10853 /* Given loop represented by LOOP_VINFO, return true if computation of
10854 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10855 otherwise. */
10857 static bool
10858 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10860 /* Constant case. */
10861 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10863 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10864 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10866 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10867 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10868 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10869 return true;
10872 widest_int max;
10873 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10874 /* Check the upper bound of loop niters. */
10875 if (get_max_loop_iterations (loop, &max))
10877 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10878 signop sgn = TYPE_SIGN (type);
10879 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10880 if (max < type_max)
10881 return true;
10883 return false;
10886 /* Return a mask type with half the number of elements as OLD_TYPE,
10887 given that it should have mode NEW_MODE. */
10889 tree
10890 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10892 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10893 return build_truth_vector_type_for_mode (nunits, new_mode);
10896 /* Return a mask type with twice as many elements as OLD_TYPE,
10897 given that it should have mode NEW_MODE. */
10899 tree
10900 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10902 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10903 return build_truth_vector_type_for_mode (nunits, new_mode);
10906 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10907 contain a sequence of NVECTORS masks that each control a vector of type
10908 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10909 these vector masks with the vector version of SCALAR_MASK. */
10911 void
10912 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10913 unsigned int nvectors, tree vectype, tree scalar_mask)
10915 gcc_assert (nvectors != 0);
10917 if (scalar_mask)
10919 scalar_cond_masked_key cond (scalar_mask, nvectors);
10920 loop_vinfo->scalar_cond_masked_set.add (cond);
10923 masks->mask_set.add (std::make_pair (vectype, nvectors));
10926 /* Given a complete set of masks MASKS, extract mask number INDEX
10927 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10928 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10930 See the comment above vec_loop_masks for more details about the mask
10931 arrangement. */
10933 tree
10934 vect_get_loop_mask (loop_vec_info loop_vinfo,
10935 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10936 unsigned int nvectors, tree vectype, unsigned int index)
10938 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10939 == vect_partial_vectors_while_ult)
10941 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10942 tree mask_type = rgm->type;
10944 /* Populate the rgroup's mask array, if this is the first time we've
10945 used it. */
10946 if (rgm->controls.is_empty ())
10948 rgm->controls.safe_grow_cleared (nvectors, true);
10949 for (unsigned int i = 0; i < nvectors; ++i)
10951 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10952 /* Provide a dummy definition until the real one is available. */
10953 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10954 rgm->controls[i] = mask;
10958 tree mask = rgm->controls[index];
10959 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10960 TYPE_VECTOR_SUBPARTS (vectype)))
10962 /* A loop mask for data type X can be reused for data type Y
10963 if X has N times more elements than Y and if Y's elements
10964 are N times bigger than X's. In this case each sequence
10965 of N elements in the loop mask will be all-zero or all-one.
10966 We can then view-convert the mask so that each sequence of
10967 N elements is replaced by a single element. */
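/* For instance, a mask built for vectors of 8 half-size elements can be
   reused for vectors of 4 double-size elements: N is 2, every pair of mask
   elements is known to be all-zero or all-one, and the view-convert below
   turns each such pair into the single element the 4-element mask type
   needs.  */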
10968 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10969 TYPE_VECTOR_SUBPARTS (vectype)));
10970 gimple_seq seq = NULL;
10971 mask_type = truth_type_for (vectype);
10972 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10973 if (seq)
10974 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10976 return mask;
10978 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10979 == vect_partial_vectors_avx512)
10981 /* The number of scalars per iteration and the number of vectors are
10982 both compile-time constants. */
10983 unsigned int nscalars_per_iter
10984 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10985 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10987 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10989 /* The stored nV is dependent on the mask type produced. */
10990 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10991 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10992 == rgm->factor);
10993 nvectors = rgm->factor;
10995 /* Populate the rgroup's mask array, if this is the first time we've
10996 used it. */
10997 if (rgm->controls.is_empty ())
10999 rgm->controls.safe_grow_cleared (nvectors, true);
11000 for (unsigned int i = 0; i < nvectors; ++i)
11002 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11003 /* Provide a dummy definition until the real one is available. */
11004 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11005 rgm->controls[i] = mask;
11008 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11009 TYPE_VECTOR_SUBPARTS (vectype)))
11010 return rgm->controls[index];
11012 /* Split the vector if needed. Since we are dealing with integer mode
11013 masks with AVX512 we can operate on the integer representation,
11014 performing the shift on the whole vector. */
11015 unsigned HOST_WIDE_INT factor;
11016 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11017 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11018 gcc_assert (ok);
11019 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11020 tree mask_type = truth_type_for (vectype);
11021 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11022 unsigned vi = index / factor;
11023 unsigned vpart = index % factor;
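/* For instance, with a factor of 2 an INDEX of 3 maps to control vector
   1 (3 / 2) and its upper half (3 % 2 == 1); the shift below moves that
   half into the low bits before the result is view-converted to the
   narrower mask type.  */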
11024 tree vec = rgm->controls[vi];
11025 gimple_seq seq = NULL;
11026 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11027 lang_hooks.types.type_for_mode
11028 (TYPE_MODE (rgm->type), 1), vec);
11029 /* For integer mode masks simply shift the right bits into position. */
11030 if (vpart != 0)
11031 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11032 build_int_cst (integer_type_node,
11033 (TYPE_VECTOR_SUBPARTS (vectype)
11034 * vpart)));
11035 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11036 (TYPE_MODE (mask_type), 1), vec);
11037 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11038 if (seq)
11039 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11040 return vec;
11042 else
11043 gcc_unreachable ();
11046 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11047 lengths for controlling an operation on VECTYPE. The operation splits
11048 each element of VECTYPE into FACTOR separate subelements, measuring the
11049 length as a number of these subelements. */
11051 void
11052 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11053 unsigned int nvectors, tree vectype, unsigned int factor)
11055 gcc_assert (nvectors != 0);
11056 if (lens->length () < nvectors)
11057 lens->safe_grow_cleared (nvectors, true);
11058 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11060 /* The number of scalars per iteration, the bytes occupied per scalar and
11061 the number of vectors are all compile-time constants. */
11062 unsigned int nscalars_per_iter
11063 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11064 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
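/* For instance, two vectors of 8 elements with a vectorization factor of 8
   give nscalars_per_iter = (2 * 8) / 8 = 2.  */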
11066 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11068 /* For now, we only support cases in which all loads and stores fall back
11069 to VnQI or none do. */
11070 gcc_assert (!rgl->max_nscalars_per_iter
11071 || (rgl->factor == 1 && factor == 1)
11072 || (rgl->max_nscalars_per_iter * rgl->factor
11073 == nscalars_per_iter * factor));
11074 rgl->max_nscalars_per_iter = nscalars_per_iter;
11075 rgl->type = vectype;
11076 rgl->factor = factor;
11080 /* Given a complete set of lengths LENS, extract length number INDEX
11081 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11082 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11083 multiplied by the number of elements that should be processed.
11084 Insert any set-up statements before GSI. */
11086 tree
11087 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11088 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11089 unsigned int index, unsigned int factor)
11091 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11092 bool use_bias_adjusted_len =
11093 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11095 /* Populate the rgroup's len array, if this is the first time we've
11096 used it. */
11097 if (rgl->controls.is_empty ())
11099 rgl->controls.safe_grow_cleared (nvectors, true);
11100 for (unsigned int i = 0; i < nvectors; ++i)
11102 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11103 gcc_assert (len_type != NULL_TREE);
11105 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11107 /* Provide a dummy definition until the real one is available. */
11108 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11109 rgl->controls[i] = len;
11111 if (use_bias_adjusted_len)
11113 gcc_assert (i == 0);
11114 tree adjusted_len =
11115 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11116 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11117 rgl->bias_adjusted_ctrl = adjusted_len;
11122 if (use_bias_adjusted_len)
11123 return rgl->bias_adjusted_ctrl;
11125 tree loop_len = rgl->controls[index];
11126 if (rgl->factor == 1 && factor == 1)
11128 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11129 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11130 if (maybe_ne (nunits1, nunits2))
11132 /* A loop len for data type X can be reused for data type Y
11133 if X has N times more elements than Y and if Y's elements
11134 are N times bigger than X's. */
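/* For instance, a length tracked for vectors of 16 single-byte elements is
   divided by 2 below before it controls vectors of 8 two-byte elements.  */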
11135 gcc_assert (multiple_p (nunits1, nunits2));
11136 factor = exact_div (nunits1, nunits2).to_constant ();
11137 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11138 gimple_seq seq = NULL;
11139 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11140 build_int_cst (iv_type, factor));
11141 if (seq)
11142 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11145 return loop_len;
11148 /* Scale profiling counters by estimation for LOOP which is vectorized
11149 by factor VF.
11150 If FLAT is true, the loop we started with had an unrealistically flat
11151 profile. */
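/* For instance, a loop entered 100 times that iterated 400 times in total
   and is vectorized with VF 4 should now iterate about 100 times, so the
   body counts are scaled down by 4 and the exit probability is raised
   accordingly.  */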
11153 static void
11154 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11156 /* For flat profiles do not scale down proportionally by VF and only
11157 cap by known iteration count bounds. */
11158 if (flat)
11160 if (dump_file && (dump_flags & TDF_DETAILS))
11161 fprintf (dump_file,
11162 "Vectorized loop profile seems flat; not scaling iteration "
11163 "count down by the vectorization factor %i\n", vf);
11164 scale_loop_profile (loop, profile_probability::always (),
11165 get_likely_max_loop_iterations_int (loop));
11166 return;
11168 /* The loop body executes VF times fewer and the exit count increases VF times. */
11169 profile_count entry_count = loop_preheader_edge (loop)->count ();
11171 /* If we have an unreliable loop profile, avoid dropping the entry
11172 count below the header count. This can happen since loops
11173 may have unrealistically low trip counts. */
11174 while (vf > 1
11175 && loop->header->count > entry_count
11176 && loop->header->count < entry_count * vf)
11178 if (dump_file && (dump_flags & TDF_DETAILS))
11179 fprintf (dump_file,
11180 "Vectorization factor %i seems too large for profile "
11181 "prevoiusly believed to be consistent; reducing.\n", vf);
11182 vf /= 2;
11185 if (entry_count.nonzero_p ())
11186 set_edge_probability_and_rescale_others
11187 (exit_e,
11188 entry_count.probability_in (loop->header->count / vf));
11189 /* Avoid producing a very large exit probability when we do not have
11190 a sensible profile. */
11191 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11192 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11193 loop->latch->count = single_pred_edge (loop->latch)->count ();
11195 scale_loop_profile (loop, profile_probability::always () / vf,
11196 get_likely_max_loop_iterations_int (loop));
11199 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11200 latch edge values originally defined by it. */
11202 static void
11203 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11204 stmt_vec_info def_stmt_info)
11206 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11207 if (!def || TREE_CODE (def) != SSA_NAME)
11208 return;
11209 stmt_vec_info phi_info;
11210 imm_use_iterator iter;
11211 use_operand_p use_p;
11212 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11214 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11215 if (!phi)
11216 continue;
11217 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11218 && (phi_info = loop_vinfo->lookup_stmt (phi))
11219 && STMT_VINFO_RELEVANT_P (phi_info)))
11220 continue;
11221 loop_p loop = gimple_bb (phi)->loop_father;
11222 edge e = loop_latch_edge (loop);
11223 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11224 continue;
11226 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11227 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11228 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11230 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11231 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11232 gcc_assert (phi_defs.length () == latch_defs.length ());
11233 for (unsigned i = 0; i < phi_defs.length (); ++i)
11234 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11235 gimple_get_lhs (latch_defs[i]), e,
11236 gimple_phi_arg_location (phi, e->dest_idx));
11238 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11240 /* For first order recurrences we have to update both uses of
11241 the latch definition, the one in the PHI node and the one
11242 in the generated VEC_PERM_EXPR. */
11243 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11244 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11245 gcc_assert (phi_defs.length () == latch_defs.length ());
11246 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11247 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11248 for (unsigned i = 0; i < phi_defs.length (); ++i)
11250 gassign *perm = as_a <gassign *> (phi_defs[i]);
11251 if (i > 0)
11252 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11253 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11254 update_stmt (perm);
11256 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11257 gimple_phi_arg_location (phi, e->dest_idx));
11262 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11263 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11264 stmt_vec_info. */
11266 static bool
11267 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11268 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11270 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11271 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11273 if (dump_enabled_p ())
11274 dump_printf_loc (MSG_NOTE, vect_location,
11275 "------>vectorizing statement: %G", stmt_info->stmt);
11277 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11278 vect_loop_kill_debug_uses (loop, stmt_info);
11280 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11281 && !STMT_VINFO_LIVE_P (stmt_info))
11282 return false;
11284 if (STMT_VINFO_VECTYPE (stmt_info))
11286 poly_uint64 nunits
11287 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11288 if (!STMT_SLP_TYPE (stmt_info)
11289 && maybe_ne (nunits, vf)
11290 && dump_enabled_p ())
11291 /* For SLP VF is set according to unrolling factor, and not
11292 to vector size, hence for SLP this print is not valid. */
11293 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11296 /* Pure SLP statements have already been vectorized. We still need
11297 to apply loop vectorization to hybrid SLP statements. */
11298 if (PURE_SLP_STMT (stmt_info))
11299 return false;
11301 if (dump_enabled_p ())
11302 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11304 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11305 *seen_store = stmt_info;
11307 return true;
11310 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11311 in the hash_map with their corresponding values. */
11313 static tree
11314 find_in_mapping (tree t, void *context)
11316 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11318 tree *value = mapping->get (t);
11319 return value ? *value : t;
11322 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11323 original loop that has now been vectorized.
11325 The inits of the data_references need to be advanced by the number of
11326 iterations of the main loop. This has been computed in vect_do_peeling and
11327 is stored in the parameter ADVANCE. We first restore the data_references'
11328 initial offsets with the values recorded in ORIG_DRS_INIT.
11330 Since the loop_vec_info of this EPILOGUE was constructed for the original
11331 loop, its stmt_vec_infos all point to the original statements. These need
11332 to be updated to point to their corresponding copies as well as the SSA_NAMES
11333 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11335 The data_reference's connections also need to be updated. Their
11336 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11337 stmt_vec_infos, their statements need to point to their corresponding copy,
11338 if they are gather loads or scatter stores then their reference needs to be
11339 updated to point to its corresponding copy and finally we set
11340 'base_misaligned' to false as we have already peeled for alignment in the
11341 prologue of the main loop. */
11343 static void
11344 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11346 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11347 auto_vec<gimple *> stmt_worklist;
11348 hash_map<tree,tree> mapping;
11349 gimple *orig_stmt, *new_stmt;
11350 gimple_stmt_iterator epilogue_gsi;
11351 gphi_iterator epilogue_phi_gsi;
11352 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11353 basic_block *epilogue_bbs = get_loop_body (epilogue);
11354 unsigned i;
11356 free (LOOP_VINFO_BBS (epilogue_vinfo));
11357 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11359 /* Advance the data_references by the number of iterations of the previous
11360 loop and its prologue. */
11361 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11364 /* The EPILOGUE loop is a copy of the original loop so they share the same
11365 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11366 point to the copied statements. We also create a mapping of all LHSs in
11367 the original loop and all the LHSs in the EPILOGUE and create worklists to
11368 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11369 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11371 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11372 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11374 new_stmt = epilogue_phi_gsi.phi ();
11376 gcc_assert (gimple_uid (new_stmt) > 0);
11377 stmt_vinfo
11378 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11380 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11381 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11383 mapping.put (gimple_phi_result (orig_stmt),
11384 gimple_phi_result (new_stmt));
11385 /* PHI nodes can not have patterns or related statements. */
11386 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11387 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11390 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11391 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11393 new_stmt = gsi_stmt (epilogue_gsi);
11394 if (is_gimple_debug (new_stmt))
11395 continue;
11397 gcc_assert (gimple_uid (new_stmt) > 0);
11398 stmt_vinfo
11399 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11401 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11402 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11404 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11405 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11407 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11409 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11410 for (gimple_stmt_iterator gsi = gsi_start (seq);
11411 !gsi_end_p (gsi); gsi_next (&gsi))
11412 stmt_worklist.safe_push (gsi_stmt (gsi));
11415 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11416 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11418 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11419 stmt_worklist.safe_push (stmt);
11420 /* Set BB such that the assert in
11421 'get_initial_def_for_reduction' is able to determine that
11422 the BB of the related stmt is inside this loop. */
11423 gimple_set_bb (stmt,
11424 gimple_bb (new_stmt));
11425 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11426 gcc_assert (related_vinfo == NULL
11427 || related_vinfo == stmt_vinfo);
11432 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11433 using the original main loop and thus need to be updated to refer to the
11434 cloned variables used in the epilogue. */
11435 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11437 gimple *stmt = stmt_worklist[i];
11438 tree *new_op;
11440 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11442 tree op = gimple_op (stmt, j);
11443 if ((new_op = mapping.get(op)))
11444 gimple_set_op (stmt, j, *new_op);
11445 else
11447 /* PR92429: The last argument of simplify_replace_tree disables
11448 folding when replacing arguments. This is required as
11449 otherwise you might end up with different statements than the
11450 ones analyzed in vect_loop_analyze, leading to different
11451 vectorization. */
11452 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11453 &find_in_mapping, &mapping, false);
11454 gimple_set_op (stmt, j, op);
11459 struct data_reference *dr;
11460 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11461 FOR_EACH_VEC_ELT (datarefs, i, dr)
11463 orig_stmt = DR_STMT (dr);
11464 gcc_assert (gimple_uid (orig_stmt) > 0);
11465 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11466 /* Data references for gather loads and scatter stores do not use the
11467 updated offset we set using ADVANCE. Instead we have to make sure the
11468 references in the data references point to the corresponding copies of
11469 the originals in the epilogue. Make sure to update both
11470 gather/scatters recognized by dataref analysis and also other
11471 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11472 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11473 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11474 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11476 DR_REF (dr)
11477 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11478 &find_in_mapping, &mapping);
11479 DR_BASE_ADDRESS (dr)
11480 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11481 &find_in_mapping, &mapping);
11483 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11484 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11485 /* The vector size of the epilogue is smaller than that of the main loop
11486 so the required alignment is either the same or lower. This means the
11487 DR will by definition be aligned. */
11488 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11491 epilogue_vinfo->shared->datarefs_copy.release ();
11492 epilogue_vinfo->shared->save_datarefs ();
11495 /* Function vect_transform_loop.
11497 The analysis phase has determined that the loop is vectorizable.
11498 Vectorize the loop - create vectorized stmts to replace the scalar
11499 stmts in the loop, and update the loop exit condition.
11500 Returns the scalar epilogue loop, if any. */
11502 class loop *
11503 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11505 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11506 class loop *epilogue = NULL;
11507 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11508 int nbbs = loop->num_nodes;
11509 int i;
11510 tree niters_vector = NULL_TREE;
11511 tree step_vector = NULL_TREE;
11512 tree niters_vector_mult_vf = NULL_TREE;
11513 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11514 unsigned int lowest_vf = constant_lower_bound (vf);
11515 gimple *stmt;
11516 bool check_profitability = false;
11517 unsigned int th;
11518 bool flat = maybe_flat_loop_profile (loop);
11520 DUMP_VECT_SCOPE ("vec_transform_loop");
11522 loop_vinfo->shared->check_datarefs ();
11524 /* Use the more conservative vectorization threshold. If the number
11525 of iterations is constant assume the cost check has been performed
11526 by our caller. If the threshold makes all loops profitable that
11527 run at least the (estimated) vectorization factor number of times
11528 checking is pointless, too. */
11529 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11530 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11532 if (dump_enabled_p ())
11533 dump_printf_loc (MSG_NOTE, vect_location,
11534 "Profitability threshold is %d loop iterations.\n",
11535 th);
11536 check_profitability = true;
11539 /* Make sure there exists a single-predecessor exit bb. Do this before
11540 versioning. */
11541 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11542 if (! single_pred_p (e->dest))
11544 split_loop_exit_edge (e, true);
11545 if (dump_enabled_p ())
11546 dump_printf (MSG_NOTE, "split exit edge\n");
11549 /* Version the loop first, if required, so the profitability check
11550 comes first. */
11552 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11554 class loop *sloop
11555 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11556 sloop->force_vectorize = false;
11557 check_profitability = false;
11560 /* Make sure there exists a single-predecessor exit bb also on the
11561 scalar loop copy. Do this after versioning but before peeling
11562 so the CFG structure is fine for both the scalar and the if-converted
11563 loop, and slpeel_duplicate_current_defs_from_edges faces matched
11564 loop-closed PHI nodes on the exit. */
11565 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11567 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11568 if (! single_pred_p (e->dest))
11570 split_loop_exit_edge (e, true);
11571 if (dump_enabled_p ())
11572 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11576 tree niters = vect_build_loop_niters (loop_vinfo);
11577 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11578 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11579 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11580 tree advance;
11581 drs_init_vec orig_drs_init;
11583 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11584 &step_vector, &niters_vector_mult_vf, th,
11585 check_profitability, niters_no_overflow,
11586 &advance);
11587 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11588 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11590 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11591 block after the loop exit. We need to scale all of that. */
11592 basic_block preheader
11593 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11594 preheader->count
11595 = preheader->count.apply_probability
11596 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11597 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11598 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11599 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11600 = preheader->count;
11603 if (niters_vector == NULL_TREE)
11605 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11606 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11607 && known_eq (lowest_vf, vf))
11609 niters_vector
11610 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11611 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11612 step_vector = build_one_cst (TREE_TYPE (niters));
11614 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11615 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11616 &step_vector, niters_no_overflow);
11617 else
11618 /* vect_do_peeling subtracted the number of peeled prologue
11619 iterations from LOOP_VINFO_NITERS. */
11620 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11621 &niters_vector, &step_vector,
11622 niters_no_overflow);
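/* Worked example for the constant-trip-count branch above (hypothetical
   numbers): with NITERS = 17, lowest_vf = vf = 4 and full vectors,
   niters_vector = 17 / 4 = 4 and step_vector = 1, i.e. the vector loop
   runs 4 times and the remaining 17 - 4 * 4 = 1 scalar iteration is
   left for the epilogue.  */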
11625 /* 1) Make sure the loop header has exactly two entries
11626 2) Make sure we have a preheader basic block. */
11628 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11630 split_edge (loop_preheader_edge (loop));
11632 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11633 /* This will deal with any possible peeling. */
11634 vect_prepare_for_masked_peels (loop_vinfo);
11636 /* Schedule the SLP instances first, then handle loop vectorization
11637 below. */
11638 if (!loop_vinfo->slp_instances.is_empty ())
11640 DUMP_VECT_SCOPE ("scheduling SLP instances");
11641 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11644 /* FORNOW: the vectorizer supports only loops whose body consists
11645 of one basic block (header + empty latch).  When the vectorizer
11646 supports more involved loop forms, the order in which the BBs are
11647 traversed needs to be reconsidered. */
11649 for (i = 0; i < nbbs; i++)
11651 basic_block bb = bbs[i];
11652 stmt_vec_info stmt_info;
11654 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11655 gsi_next (&si))
11657 gphi *phi = si.phi ();
11658 if (dump_enabled_p ())
11659 dump_printf_loc (MSG_NOTE, vect_location,
11660 "------>vectorizing phi: %G", (gimple *) phi);
11661 stmt_info = loop_vinfo->lookup_stmt (phi);
11662 if (!stmt_info)
11663 continue;
11665 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11666 vect_loop_kill_debug_uses (loop, stmt_info);
11668 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11669 && !STMT_VINFO_LIVE_P (stmt_info))
11670 continue;
11672 if (STMT_VINFO_VECTYPE (stmt_info)
11673 && (maybe_ne
11674 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11675 && dump_enabled_p ())
11676 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11678 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11679 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11680 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11681 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11682 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11683 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11684 && ! PURE_SLP_STMT (stmt_info))
11686 if (dump_enabled_p ())
11687 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11688 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11692 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11693 gsi_next (&si))
11695 gphi *phi = si.phi ();
11696 stmt_info = loop_vinfo->lookup_stmt (phi);
11697 if (!stmt_info)
11698 continue;
11700 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11701 && !STMT_VINFO_LIVE_P (stmt_info))
11702 continue;
11704 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11705 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11706 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11707 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11708 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11709 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11710 && ! PURE_SLP_STMT (stmt_info))
11711 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11714 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11715 !gsi_end_p (si);)
11717 stmt = gsi_stmt (si);
11718 /* During vectorization remove existing clobber stmts. */
11719 if (gimple_clobber_p (stmt))
11721 unlink_stmt_vdef (stmt);
11722 gsi_remove (&si, true);
11723 release_defs (stmt);
11725 else
11727 /* Ignore vector stmts created in the outer loop. */
11728 stmt_info = loop_vinfo->lookup_stmt (stmt);
11730 /* vector stmts created in the outer-loop during vectorization of
11731 stmts in an inner-loop may not have a stmt_info, and do not
11732 need to be vectorized. */
11733 stmt_vec_info seen_store = NULL;
11734 if (stmt_info)
11736 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11738 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11739 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11740 !gsi_end_p (subsi); gsi_next (&subsi))
11742 stmt_vec_info pat_stmt_info
11743 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11744 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11745 &si, &seen_store);
11747 stmt_vec_info pat_stmt_info
11748 = STMT_VINFO_RELATED_STMT (stmt_info);
11749 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11750 &si, &seen_store))
11751 maybe_set_vectorized_backedge_value (loop_vinfo,
11752 pat_stmt_info);
11754 else
11756 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11757 &seen_store))
11758 maybe_set_vectorized_backedge_value (loop_vinfo,
11759 stmt_info);
11762 gsi_next (&si);
11763 if (seen_store)
11765 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11766 /* Interleaving.  The vectorization of the
11767 interleaving chain was completed;
11768 free all the stores in the chain. */
11769 vect_remove_stores (loop_vinfo,
11770 DR_GROUP_FIRST_ELEMENT (seen_store));
11771 else
11772 /* Free the attached stmt_vec_info and remove the stmt. */
11773 loop_vinfo->remove_stmt (stmt_info);
11778 /* Stub out scalar statements that must not survive vectorization.
11779 Doing this here helps with grouped statements, or statements that
11780 are involved in patterns. */
11781 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11782 !gsi_end_p (gsi); gsi_next (&gsi))
11784 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11785 if (!call || !gimple_call_internal_p (call))
11786 continue;
11787 internal_fn ifn = gimple_call_internal_fn (call);
11788 if (ifn == IFN_MASK_LOAD)
11790 tree lhs = gimple_get_lhs (call);
11791 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11793 tree zero = build_zero_cst (TREE_TYPE (lhs));
11794 gimple *new_stmt = gimple_build_assign (lhs, zero);
11795 gsi_replace (&gsi, new_stmt, true);
11798 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11800 tree lhs = gimple_get_lhs (call);
11801 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11803 tree else_arg
11804 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11805 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11806 gsi_replace (&gsi, new_stmt, true);
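/* Illustrative effect of the stubbing above (SSA names are made up):
   a left-over scalar  _1 = .MASK_LOAD (ptr_5, 8B, mask_7);  becomes
   _1 = 0;  and a left-over scalar  _2 = .COND_ADD (mask_7, a_3, b_4, else_6);
   becomes  _2 = else_6;  because the else value is the last call argument.  */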
11810 } /* BBs in loop */
11812 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11813 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11814 if (integer_onep (step_vector))
11815 niters_no_overflow = true;
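/* Illustration: in the worst case a 32-bit NITERS wraps to 0 (2^32 scalar
   iterations); with VF = 4 the corresponding NITERS_VECTOR is 2^30, which
   is nonzero and representable, so an IV stepping by 1 up to it cannot
   wrap.  */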
11816 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11817 niters_vector, step_vector, niters_vector_mult_vf,
11818 !niters_no_overflow);
11820 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11822 /* True if the final iteration might not handle a full vector's
11823 worth of scalar iterations. */
11824 bool final_iter_may_be_partial
11825 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11826 /* The minimum number of iterations performed by the epilogue. This
11827 is 1 when peeling for gaps because we always need a final scalar
11828 iteration. */
11829 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11830 /* +1 to convert latch counts to loop iteration counts,
11831 -min_epilogue_iters to remove iterations that cannot be performed
11832 by the vector code. */
11833 int bias_for_lowest = 1 - min_epilogue_iters;
11834 int bias_for_assumed = bias_for_lowest;
11835 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11836 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11838 /* When the amount of peeling is known at compile time, the first
11839 iteration will have exactly alignment_npeels active elements.
11840 In the worst case it will have at least one. */
11841 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11842 bias_for_lowest += lowest_vf - min_first_active;
11843 bias_for_assumed += assumed_vf - min_first_active;
11845 /* In these calculations the "- 1" converts loop iteration counts
11846 back to latch counts. */
11847 if (loop->any_upper_bound)
11849 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11850 loop->nb_iterations_upper_bound
11851 = (final_iter_may_be_partial
11852 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11853 lowest_vf) - 1
11854 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11855 lowest_vf) - 1);
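/* Worked example (hypothetical numbers, full vectors): with lowest_vf = 4
   and PEELING_FOR_GAPS set, min_epilogue_iters = 1 and bias_for_lowest = 0;
   a scalar latch bound of 18 (19 iterations) then becomes
   floor ((18 + 0) / 4) - 1 = 3 latch iterations of the vector loop,
   i.e. at most 4 vector iterations.  */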
11856 if (main_vinfo
11857 /* Both peeling for alignment and peeling for gaps can end up
11858 with the scalar epilogue running for more than VF-1 iterations. */
11859 && !main_vinfo->peeling_for_alignment
11860 && !main_vinfo->peeling_for_gaps)
11862 unsigned int bound;
11863 poly_uint64 main_iters
11864 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11865 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11866 main_iters
11867 = upper_bound (main_iters,
11868 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11869 if (can_div_away_from_zero_p (main_iters,
11870 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11871 &bound))
11872 loop->nb_iterations_upper_bound
11873 = wi::umin ((bound_wide_int) (bound - 1),
11874 loop->nb_iterations_upper_bound);
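/* Illustration (hypothetical numbers): for an epilogue loop with VF = 4
   whose main loop has VF = 8 and cost/versioning thresholds of at most 8,
   main_iters = 8; dividing by 4 (rounding away from zero) gives bound = 2,
   so the epilogue's latch bound is capped at bound - 1 = 1.  */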
11877 if (loop->any_likely_upper_bound)
11878 loop->nb_iterations_likely_upper_bound
11879 = (final_iter_may_be_partial
11880 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11881 + bias_for_lowest, lowest_vf) - 1
11882 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11883 + bias_for_lowest, lowest_vf) - 1);
11884 if (loop->any_estimate)
11885 loop->nb_iterations_estimate
11886 = (final_iter_may_be_partial
11887 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11888 assumed_vf) - 1
11889 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11890 assumed_vf) - 1);
11891 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11892 assumed_vf, flat);
11894 if (dump_enabled_p ())
11896 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11898 dump_printf_loc (MSG_NOTE, vect_location,
11899 "LOOP VECTORIZED\n");
11900 if (loop->inner)
11901 dump_printf_loc (MSG_NOTE, vect_location,
11902 "OUTER LOOP VECTORIZED\n");
11903 dump_printf (MSG_NOTE, "\n");
11905 else
11906 dump_printf_loc (MSG_NOTE, vect_location,
11907 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11908 GET_MODE_NAME (loop_vinfo->vector_mode));
11911 /* Loops vectorized with a variable factor won't benefit from
11912 unrolling/peeling. */
11913 if (!vf.is_constant ())
11915 loop->unroll = 1;
11916 if (dump_enabled_p ())
11917 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11918 " variable-length vectorization factor\n");
11920 /* Free SLP instances here because otherwise stmt reference counting
11921 won't work. */
11922 slp_instance instance;
11923 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11924 vect_free_slp_instance (instance);
11925 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11926 /* Clear the safelen field since its value is no longer valid after
11927 vectorization: the vectorized loop can have loop-carried dependencies. */
11928 loop->safelen = 0;
11930 if (epilogue)
11932 update_epilogue_loop_vinfo (epilogue, advance);
11934 epilogue->simduid = loop->simduid;
11935 epilogue->force_vectorize = loop->force_vectorize;
11936 epilogue->dont_vectorize = false;
11939 return epilogue;
11942 /* The code below performs a simple optimization: it reverts if-conversion
11943 for masked stores, i.e. if the mask of a store is zero, the store is not
11944 performed and, where possible, neither are the statements producing the stored values.
11945 For example,
11946 for (i=0; i<n; i++)
11947 if (c[i])
11949 p1[i] += 1;
11950 p2[i] = p3[i] +2;
11952 this transformation will produce the following semi-hammock:
11954 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11956 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11957 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11958 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11959 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11960 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11961 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11965 void
11966 optimize_mask_stores (class loop *loop)
11968 basic_block *bbs = get_loop_body (loop);
11969 unsigned nbbs = loop->num_nodes;
11970 unsigned i;
11971 basic_block bb;
11972 class loop *bb_loop;
11973 gimple_stmt_iterator gsi;
11974 gimple *stmt;
11975 auto_vec<gimple *> worklist;
11976 auto_purge_vect_location sentinel;
11978 vect_location = find_loop_location (loop);
11979 /* Pick up all masked stores in the loop, if any. */
11980 for (i = 0; i < nbbs; i++)
11982 bb = bbs[i];
11983 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11984 gsi_next (&gsi))
11986 stmt = gsi_stmt (gsi);
11987 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11988 worklist.safe_push (stmt);
11992 free (bbs);
11993 if (worklist.is_empty ())
11994 return;
11996 /* Loop has masked stores. */
11997 while (!worklist.is_empty ())
11999 gimple *last, *last_store;
12000 edge e, efalse;
12001 tree mask;
12002 basic_block store_bb, join_bb;
12003 gimple_stmt_iterator gsi_to;
12004 tree vdef, new_vdef;
12005 gphi *phi;
12006 tree vectype;
12007 tree zero;
12009 last = worklist.pop ();
12010 mask = gimple_call_arg (last, 2);
12011 bb = gimple_bb (last);
12012 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
12013 to the same loop as if_bb.  That loop can differ from LOOP when a
12014 two-level loop nest is vectorized and the mask_store belongs to the
12015 inner loop. */
12016 e = split_block (bb, last);
12017 bb_loop = bb->loop_father;
12018 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12019 join_bb = e->dest;
12020 store_bb = create_empty_bb (bb);
12021 add_bb_to_loop (store_bb, bb_loop);
12022 e->flags = EDGE_TRUE_VALUE;
12023 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12024 /* Put STORE_BB on the likely path. */
12025 efalse->probability = profile_probability::likely ();
12026 e->probability = efalse->probability.invert ();
12027 store_bb->count = efalse->count ();
12028 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12029 if (dom_info_available_p (CDI_DOMINATORS))
12030 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
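/* Sketch of the CFG being built here (the comparison is added just below):

       bb:  if (mask == { 0, ... })
        | \
   true |  \ false
        |   store_bb   <- masked stores are sunk here
        |  /
     join_bb

   so when the whole mask is zero the stores are skipped.  */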
12031 if (dump_enabled_p ())
12032 dump_printf_loc (MSG_NOTE, vect_location,
12033 "Create new block %d to sink mask stores.",
12034 store_bb->index);
12035 /* Create vector comparison with boolean result. */
12036 vectype = TREE_TYPE (mask);
12037 zero = build_zero_cst (vectype);
12038 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12039 gsi = gsi_last_bb (bb);
12040 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12041 /* Create new PHI node for vdef of the last masked store:
12042 .MEM_2 = VDEF <.MEM_1>
12043 will be converted to
12044 .MEM.3 = VDEF <.MEM_1>
12045 and new PHI node will be created in join bb
12046 .MEM_2 = PHI <.MEM_1, .MEM_3>
12048 vdef = gimple_vdef (last);
12049 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12050 gimple_set_vdef (last, new_vdef);
12051 phi = create_phi_node (vdef, join_bb);
12052 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12054 /* Put all masked stores with the same mask to STORE_BB if possible. */
12055 while (true)
12057 gimple_stmt_iterator gsi_from;
12058 gimple *stmt1 = NULL;
12060 /* Move masked store to STORE_BB. */
12061 last_store = last;
12062 gsi = gsi_for_stmt (last);
12063 gsi_from = gsi;
12064 /* Shift GSI to the previous stmt for further traversal. */
12065 gsi_prev (&gsi);
12066 gsi_to = gsi_start_bb (store_bb);
12067 gsi_move_before (&gsi_from, &gsi_to);
12069 /* Set GSI_TO to the start of the now non-empty block. */
12069 gsi_to = gsi_start_bb (store_bb);
12070 if (dump_enabled_p ())
12071 dump_printf_loc (MSG_NOTE, vect_location,
12072 "Move stmt to created bb\n%G", last);
12073 /* Move all stored value producers if possible. */
12074 while (!gsi_end_p (gsi))
12076 tree lhs;
12077 imm_use_iterator imm_iter;
12078 use_operand_p use_p;
12079 bool res;
12081 /* Skip debug statements. */
12082 if (is_gimple_debug (gsi_stmt (gsi)))
12084 gsi_prev (&gsi);
12085 continue;
12087 stmt1 = gsi_stmt (gsi);
12088 /* Do not consider statements writing to memory or having
12089 a volatile operand. */
12090 if (gimple_vdef (stmt1)
12091 || gimple_has_volatile_ops (stmt1))
12092 break;
12093 gsi_from = gsi;
12094 gsi_prev (&gsi);
12095 lhs = gimple_get_lhs (stmt1);
12096 if (!lhs)
12097 break;
12099 /* LHS of vectorized stmt must be SSA_NAME. */
12100 if (TREE_CODE (lhs) != SSA_NAME)
12101 break;
12103 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12105 /* Remove dead scalar statement. */
12106 if (has_zero_uses (lhs))
12108 gsi_remove (&gsi_from, true);
12109 continue;
12113 /* Check that LHS does not have uses outside of STORE_BB. */
12114 res = true;
12115 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12117 gimple *use_stmt;
12118 use_stmt = USE_STMT (use_p);
12119 if (is_gimple_debug (use_stmt))
12120 continue;
12121 if (gimple_bb (use_stmt) != store_bb)
12123 res = false;
12124 break;
12127 if (!res)
12128 break;
12130 if (gimple_vuse (stmt1)
12131 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12132 break;
12134 /* Can move STMT1 to STORE_BB. */
12135 if (dump_enabled_p ())
12136 dump_printf_loc (MSG_NOTE, vect_location,
12137 "Move stmt to created bb\n%G", stmt1);
12138 gsi_move_before (&gsi_from, &gsi_to);
12139 /* Shift GSI_TO for further insertion. */
12140 gsi_prev (&gsi_to);
12142 /* Put other masked stores with the same mask to STORE_BB. */
12143 if (worklist.is_empty ()
12144 || gimple_call_arg (worklist.last (), 2) != mask
12145 || worklist.last () != stmt1)
12146 break;
12147 last = worklist.pop ();
12149 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12153 /* Decide whether it is possible to use a zero-based induction variable
12154 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12155 the value that the induction variable must be able to hold in order
12156 to ensure that the rgroups eventually have no active vector elements.
12157 Return -1 otherwise. */
12159 widest_int
12160 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12162 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12163 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12164 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12166 /* Calculate the value that the induction variable must be able
12167 to hit in order to ensure that we end the loop with an all-false mask.
12168 This involves adding the maximum number of inactive trailing scalar
12169 iterations. */
12170 widest_int iv_limit = -1;
12171 if (max_loop_iterations (loop, &iv_limit))
12173 if (niters_skip)
12175 /* Add the maximum number of skipped iterations to the
12176 maximum iteration count. */
12177 if (TREE_CODE (niters_skip) == INTEGER_CST)
12178 iv_limit += wi::to_widest (niters_skip);
12179 else
12180 iv_limit += max_vf - 1;
12182 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12183 /* Make a conservatively-correct assumption. */
12184 iv_limit += max_vf - 1;
12186 /* IV_LIMIT is the maximum number of latch iterations, which is also
12187 the maximum in-range IV value. Round this value down to the previous
12188 vector alignment boundary and then add an extra full iteration. */
12189 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12190 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
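/* Worked example (hypothetical numbers): with a maximum latch count of 103,
   no skipped or peeled iterations, a constant VF of 8 and max_vf = 8,
   this gives (103 & -8) + 8 = 96 + 8 = 104, so the IV must be able to
   hold the value 104.  */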
12192 return iv_limit;
12195 /* For the given rgroup_controls RGC, check whether an induction variable
12196 would ever hit a value that produces a set of all-false masks or zero
12197 lengths before wrapping around. Return true if it's possible to wrap
12198 around before hitting the desirable value, otherwise return false. */
12200 bool
12201 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12203 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12205 if (iv_limit == -1)
12206 return true;
12208 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12209 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12210 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12212 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12213 return true;
12215 return false;
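/* Worked example (hypothetical numbers): with iv_limit = 200 and an rgroup
   where max_nscalars_per_iter = 2 and factor = 1 (nitems = 2), the IV may
   need to reach 200 * 2 = 400, which needs 9 unsigned bits; an 8-bit
   compare type therefore gives true (the IV might wrap), while a 16-bit
   or wider compare type gives false.  */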