gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "memmodel.h"
  36 #include "optabs.h"
  37 #include "diagnostic-core.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "cfganal.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-ssa-loop.h"
  48 #include "cfgloop.h"
  49 #include "tree-scalar-evolution.h"
  50 #include "tree-vectorizer.h"
  51 #include "gimple-fold.h"
  52 #include "cgraph.h"
  53 #include "tree-cfg.h"
  54 #include "tree-if-conv.h"
  55 #include "internal-fn.h"
  56 #include "tree-vector-builder.h"
  57 #include "vec-perm-indices.h"
  58 #include "tree-eh.h"
  59 #include "case-cfn-macros.h"
  60 #include "langhooks.h"
  61
  62 /* Loop Vectorization Pass.
  63
  64    This pass tries to vectorize loops.
  65
  66    For example, the vectorizer transforms the following simple loop:
  67
  68         short a[N]; short b[N]; short c[N]; int i;
  69
  70         for (i=0; i<N; i++){
  71           a[i] = b[i] + c[i];
  72         }
  73
  74    as if it was manually vectorized by rewriting the source code into:
  75
  76         typedef int __attribute__((mode(V8HI))) v8hi;
  77         short a[N];  short b[N]; short c[N];   int i;
  78         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  79         v8hi va, vb, vc;
  80
  81         for (i=0; i<N/8; i++){
  82           vb = pb[i];
  83           vc = pc[i];
  84           va = vb + vc;
  85           pa[i] = va;
  86         }
  87
  88         The main entry to this pass is vectorize_loops(), in which
  89    the vectorizer applies a set of analyses on a given set of loops,
  90    followed by the actual vectorization transformation for the loops that
  91    had successfully passed the analysis phase.
  92         Throughout this pass we make a distinction between two types of
  93    data: scalars (which are represented by SSA_NAMES), and memory references
  94    ("data-refs").  These two types of data require different handling both
  95    during analysis and transformation. The types of data-refs that the
  96    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  97    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  98    accesses are required to have a simple (consecutive) access pattern.
  99
 100    Analysis phase:
 101    ===============
 102         The driver for the analysis phase is vect_analyze_loop().
 103    It applies a set of analyses, some of which rely on the scalar evolution
 104    analyzer (scev) developed by Sebastian Pop.
 105
 106         During the analysis phase the vectorizer records some information
 107    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 108    loop, as well as general information about the loop as a whole, which is
 109    recorded in a "loop_vec_info" struct attached to each loop.
 110
 111    Transformation phase:
 112    =====================
 113         The loop transformation phase scans all the stmts in the loop, and
 114    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 115    the loop that needs to be vectorized.  It inserts the vector code sequence
 116    just before the scalar stmt S, and records a pointer to the vector code
 117    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 118    attached to S).  This pointer will be used for the vectorization of following
 119    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 120    otherwise, we rely on dead code elimination for removing it.
 121
 122         For example, say stmt S1 was vectorized into stmt VS1:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    S2:  a = b;
 127
 128    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 129    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 130    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 131    resulting sequence would be:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    VS2: va = vb;
 136    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 137
 138         Operands that are not SSA_NAMEs, are data-refs that appear in
 139    load/store operations (like 'x[i]' in S1), and are handled differently.
 140
 141    Target modeling:
 142    =================
 143         Currently the only target specific information that is used is the
 144    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 145    Targets that can support different sizes of vectors, for now will need
 146    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 147    flexibility will be added in the future.
 148
 149         Since we only vectorize operations whose vector form can be
 150    expressed using existing tree codes, to verify that an operation is
 151    supported, the vectorizer checks the relevant optab at the relevant
 152    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 153    the value found is CODE_FOR_nothing, then there's no target support, and
 154    we can't vectorize the stmt.
 155
 156    For additional information on this project see:
 157    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 158 */
 159
 160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 161                                                 unsigned *);
 162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 163                                                bool *, bool *, bool);
 164
 165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 166    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 167    may already be set for general statements (not just data refs).  */
 168
 169 static opt_result
 170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 171                               bool vectype_maybe_set_p,
 172                               poly_uint64 *vf)
 173 {
 174   gimple *stmt = stmt_info->stmt;
 175
 176   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 177        && !STMT_VINFO_LIVE_P (stmt_info))
 178       || gimple_clobber_p (stmt))
 179     {
 180       if (dump_enabled_p ())
 181         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 182       return opt_result::success ();
 183     }
 184
 185   tree stmt_vectype, nunits_vectype;
 186   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 187                                                    &stmt_vectype,
 188                                                    &nunits_vectype);
 189   if (!res)
 190     return res;
 191
 192   if (stmt_vectype)
 193     {
 194       if (STMT_VINFO_VECTYPE (stmt_info))
 195         /* The only case when a vectype had been already set is for stmts
 196            that contain a data ref, or for "pattern-stmts" (stmts generated
 197            by the vectorizer to represent/replace a certain idiom).  */
 198         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 199                      || vectype_maybe_set_p)
 200                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 201       else
 202         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 203     }
 204
 205   if (nunits_vectype)
 206     vect_update_max_nunits (vf, nunits_vectype);
 207
 208   return opt_result::success ();
 209 }
 210
 211 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 212    types of STMT_INFO and all attached pattern statements and update
 213    the vectorization factor VF accordingly.  Return true on success
 214    or false if something prevented vectorization.  */
 215
 216 static opt_result
 217 vect_determine_vf_for_stmt (vec_info *vinfo,
 218                             stmt_vec_info stmt_info, poly_uint64 *vf)
 219 {
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 222                      stmt_info->stmt);
 223   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 224   if (!res)
 225     return res;
 226
 227   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 228       && STMT_VINFO_RELATED_STMT (stmt_info))
 229     {
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 232
 233       /* If a pattern statement has def stmts, analyze them too.  */
 234       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 235            !gsi_end_p (si); gsi_next (&si))
 236         {
 237           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 238           if (dump_enabled_p ())
 239             dump_printf_loc (MSG_NOTE, vect_location,
 240                              "==> examining pattern def stmt: %G",
 241                              def_stmt_info->stmt);
 242           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 243           if (!res)
 244             return res;
 245         }
 246
 247       if (dump_enabled_p ())
 248         dump_printf_loc (MSG_NOTE, vect_location,
 249                          "==> examining pattern statement: %G",
 250                          stmt_info->stmt);
 251       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 252       if (!res)
 253         return res;
 254     }
 255
 256   return opt_result::success ();
 257 }
 258
 259 /* Function vect_determine_vectorization_factor
 260
 261    Determine the vectorization factor (VF).  VF is the number of data elements
 262    that are operated upon in parallel in a single iteration of the vectorized
 263    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 264    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 265    elements can fit in a single vector register.
 266
 267    We currently support vectorization of loops in which all types operated upon
 268    are of the same size.  Therefore this function currently sets VF according to
 269    the size of the types operated upon, and fails if there are multiple sizes
 270    in the loop.
 271
 272    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 273    original loop:
 274         for (i=0; i<N; i++){
 275           a[i] = b[i] + c[i];
 276         }
 277
 278    vectorized loop:
 279         for (i=0; i<N; i+=VF){
 280           a[i:VF] = b[i:VF] + c[i:VF];
 281         }
 282 */
 283
 284 static opt_result
 285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 286 {
 287   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 288   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 289   unsigned nbbs = loop->num_nodes;
 290   poly_uint64 vectorization_factor = 1;
 291   tree scalar_type = NULL_TREE;
 292   gphi *phi;
 293   tree vectype;
 294   stmt_vec_info stmt_info;
 295   unsigned i;
 296
 297   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 298
 299   for (i = 0; i < nbbs; i++)
 300     {
 301       basic_block bb = bbs[i];
 302
 303       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 304            gsi_next (&si))
 305         {
 306           phi = si.phi ();
 307           stmt_info = loop_vinfo->lookup_stmt (phi);
 308           if (dump_enabled_p ())
 309             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 310                              (gimple *) phi);
 311
 312           gcc_assert (stmt_info);
 313
 314           if (STMT_VINFO_RELEVANT_P (stmt_info)
 315               || STMT_VINFO_LIVE_P (stmt_info))
 316             {
 317               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 318               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 319
 320               if (dump_enabled_p ())
 321                 dump_printf_loc (MSG_NOTE, vect_location,
 322                                  "get vectype for scalar type:  %T\n",
 323                                  scalar_type);
 324
 325               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 326               if (!vectype)
 327                 return opt_result::failure_at (phi,
 328                                                "not vectorized: unsupported "
 329                                                "data-type %T\n",
 330                                                scalar_type);
 331               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 332
 333               if (dump_enabled_p ())
 334                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 335                                  vectype);
 336
 337               if (dump_enabled_p ())
 338                 {
 339                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 340                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 341                   dump_printf (MSG_NOTE, "\n");
 342                 }
 343
 344               vect_update_max_nunits (&vectorization_factor, vectype);
 345             }
 346         }
 347
 348       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 349            gsi_next (&si))
 350         {
 351           if (is_gimple_debug (gsi_stmt (si)))
 352             continue;
 353           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 354           opt_result res
 355             = vect_determine_vf_for_stmt (loop_vinfo,
 356                                           stmt_info, &vectorization_factor);
 357           if (!res)
 358             return res;
 359         }
 360     }
 361
 362   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 363   if (dump_enabled_p ())
 364     {
 365       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 366       dump_dec (MSG_NOTE, vectorization_factor);
 367       dump_printf (MSG_NOTE, "\n");
 368     }
 369
 370   if (known_le (vectorization_factor, 1U))
 371     return opt_result::failure_at (vect_location,
 372                                    "not vectorized: unsupported data-type\n");
 373   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 374   return opt_result::success ();
 375 }
 376
 377
 378 /* Function vect_is_simple_iv_evolution.
 379
 380    FORNOW: A simple evolution of an induction variables in the loop is
 381    considered a polynomial evolution.  */
 382
 383 static bool
 384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 385                              tree * step)
 386 {
 387   tree init_expr;
 388   tree step_expr;
 389   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 390   basic_block bb;
 391
 392   /* When there is no evolution in this loop, the evolution function
 393      is not "simple".  */
 394   if (evolution_part == NULL_TREE)
 395     return false;
 396
 397   /* When the evolution is a polynomial of degree >= 2
 398      the evolution function is not "simple".  */
 399   if (tree_is_chrec (evolution_part))
 400     return false;
 401
 402   step_expr = evolution_part;
 403   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 404
 405   if (dump_enabled_p ())
 406     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 407                      step_expr, init_expr);
 408
 409   *init = init_expr;
 410   *step = step_expr;
 411
 412   if (TREE_CODE (step_expr) != INTEGER_CST
 413       && (TREE_CODE (step_expr) != SSA_NAME
 414           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 415               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 416           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 417               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 418                   || !flag_associative_math)))
 419       && (TREE_CODE (step_expr) != REAL_CST
 420           || !flag_associative_math))
 421     {
 422       if (dump_enabled_p ())
 423         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 424                          "step unknown.\n");
 425       return false;
 426     }
 427
 428   return true;
 429 }
 430
 431 /* Function vect_is_nonlinear_iv_evolution
 432
 433    Only support nonlinear induction for integer type
 434    1. neg
 435    2. mul by constant
 436    3. lshift/rshift by constant.
 437
 438    For neg induction, return a fake step as integer -1.  */
 439 static bool
 440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 441                                 gphi* loop_phi_node, tree *init, tree *step)
 442 {
 443   tree init_expr, ev_expr, result, op1, op2;
 444   gimple* def;
 445
 446   if (gimple_phi_num_args (loop_phi_node) != 2)
 447     return false;
 448
 449   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 450   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 451
 452   /* Support nonlinear induction only for integer type.  */
 453   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 454     return false;
 455
 456   *init = init_expr;
 457   result = PHI_RESULT (loop_phi_node);
 458
 459   if (TREE_CODE (ev_expr) != SSA_NAME
 460       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 461       || !is_gimple_assign (def))
 462     return false;
 463
 464   enum tree_code t_code = gimple_assign_rhs_code (def);
 465   switch (t_code)
 466     {
 467     case NEGATE_EXPR:
 468       if (gimple_assign_rhs1 (def) != result)
 469         return false;
 470       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 471       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 472       break;
 473
 474     case RSHIFT_EXPR:
 475     case LSHIFT_EXPR:
 476     case MULT_EXPR:
 477       op1 = gimple_assign_rhs1 (def);
 478       op2 = gimple_assign_rhs2 (def);
 479       if (TREE_CODE (op2) != INTEGER_CST
 480           || op1 != result)
 481         return false;
 482       *step = op2;
 483       if (t_code == LSHIFT_EXPR)
 484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 485       else if (t_code == RSHIFT_EXPR)
 486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 487       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 488       else
 489         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 490       break;
 491
 492     default:
 493       return false;
 494     }
 495
 496   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 497   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 498
 499   return true;
 500 }
 501
 502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 503    what we are assuming is a double reduction.  For example, given
 504    a structure like this:
 505
 506       outer1:
 507         x_1 = PHI <x_4(outer2), ...>;
 508         ...
 509
 510       inner:
 511         x_2 = PHI <x_1(outer1), ...>;
 512         ...
 513         x_3 = ...;
 514         ...
 515
 516       outer2:
 517         x_4 = PHI <x_3(inner)>;
 518         ...
 519
 520    outer loop analysis would treat x_1 as a double reduction phi and
 521    this function would then return true for x_2.  */
 522
 523 static bool
 524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 525 {
 526   use_operand_p use_p;
 527   ssa_op_iter op_iter;
 528   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 529     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 530       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 531         return true;
 532   return false;
 533 }
 534
 535 /* Returns true if Phi is a first-order recurrence. A first-order
 536    recurrence is a non-reduction recurrence relation in which the value of
 537    the recurrence in the current loop iteration equals a value defined in
 538    the previous iteration.  */
 539
 540 static bool
 541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 542                                    gphi *phi)
 543 {
 544   /* A nested cycle isn't vectorizable as first order recurrence.  */
 545   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 546     return false;
 547
 548   /* Ensure the loop latch definition is from within the loop.  */
 549   edge latch = loop_latch_edge (loop);
 550   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 551   if (TREE_CODE (ldef) != SSA_NAME
 552       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 553       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 554       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 555     return false;
 556
 557   tree def = gimple_phi_result (phi);
 558
 559   /* Ensure every use_stmt of the phi node is dominated by the latch
 560      definition.  */
 561   imm_use_iterator imm_iter;
 562   use_operand_p use_p;
 563   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 564     if (!is_gimple_debug (USE_STMT (use_p))
 565         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 566             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 567                                             USE_STMT (use_p))))
 568       return false;
 569
 570   /* First-order recurrence autovectorization needs shuffle vector.  */
 571   tree scalar_type = TREE_TYPE (def);
 572   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 573   if (!vectype)
 574     return false;
 575
 576   return true;
 577 }
 578
 579 /* Function vect_analyze_scalar_cycles_1.
 580
 581    Examine the cross iteration def-use cycles of scalar variables
 582    in LOOP.  LOOP_VINFO represents the loop that is now being
 583    considered for vectorization (can be LOOP, or an outer-loop
 584    enclosing LOOP).  SLP indicates there will be some subsequent
 585    slp analyses or not.  */
 586
 587 static void
 588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 589                               bool slp)
 590 {
 591   basic_block bb = loop->header;
 592   tree init, step;
 593   auto_vec<stmt_vec_info, 64> worklist;
 594   gphi_iterator gsi;
 595   bool double_reduc, reduc_chain;
 596
 597   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 598
 599   /* First - identify all inductions.  Reduction detection assumes that all the
 600      inductions have been identified, therefore, this order must not be
 601      changed.  */
 602   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 603     {
 604       gphi *phi = gsi.phi ();
 605       tree access_fn = NULL;
 606       tree def = PHI_RESULT (phi);
 607       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 611                          (gimple *) phi);
 612
 613       /* Skip virtual phi's.  The data dependences that are associated with
 614          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 615       if (virtual_operand_p (def))
 616         continue;
 617
 618       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 619
 620       /* Analyze the evolution function.  */
 621       access_fn = analyze_scalar_evolution (loop, def);
 622       if (access_fn)
 623         {
 624           STRIP_NOPS (access_fn);
 625           if (dump_enabled_p ())
 626             dump_printf_loc (MSG_NOTE, vect_location,
 627                              "Access function of PHI: %T\n", access_fn);
 628           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 629             = initial_condition_in_loop_num (access_fn, loop->num);
 630           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 631             = evolution_part_in_loop_num (access_fn, loop->num);
 632         }
 633
 634       if ((!access_fn
 635            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 636            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 637                                             &init, &step)
 638            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 639                && TREE_CODE (step) != INTEGER_CST))
 640           /* Only handle nonlinear iv for same loop.  */
 641           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 642               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 643                                                   phi, &init, &step)))
 644         {
 645           worklist.safe_push (stmt_vinfo);
 646           continue;
 647         }
 648
 649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 650                   != NULL_TREE);
 651       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 652
 653       if (dump_enabled_p ())
 654         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 655       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 656     }
 657
 658
 659   /* Second - identify all reductions and nested cycles.  */
 660   while (worklist.length () > 0)
 661     {
 662       stmt_vec_info stmt_vinfo = worklist.pop ();
 663       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 664       tree def = PHI_RESULT (phi);
 665
 666       if (dump_enabled_p ())
 667         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 668                          (gimple *) phi);
 669
 670       gcc_assert (!virtual_operand_p (def)
 671                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 672
 673       stmt_vec_info reduc_stmt_info
 674         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 675                                     &reduc_chain, slp);
 676       if (reduc_stmt_info)
 677         {
 678           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 679           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 680           if (double_reduc)
 681             {
 682               if (dump_enabled_p ())
 683                 dump_printf_loc (MSG_NOTE, vect_location,
 684                                  "Detected double reduction.\n");
 685
 686               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 687               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 688             }
 689           else
 690             {
 691               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 692                 {
 693                   if (dump_enabled_p ())
 694                     dump_printf_loc (MSG_NOTE, vect_location,
 695                                      "Detected vectorizable nested cycle.\n");
 696
 697                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 698                 }
 699               else
 700                 {
 701                   if (dump_enabled_p ())
 702                     dump_printf_loc (MSG_NOTE, vect_location,
 703                                      "Detected reduction.\n");
 704
 705                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 706                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 707                   /* Store the reduction cycles for possible vectorization in
 708                      loop-aware SLP if it was not detected as reduction
 709                      chain.  */
 710                   if (! reduc_chain)
 711                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 712                       (reduc_stmt_info);
 713                 }
 714             }
 715         }
 716       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 717         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 718       else
 719         if (dump_enabled_p ())
 720           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                            "Unknown def-use cycle pattern.\n");
 722     }
 723 }
 724
 725
 726 /* Function vect_analyze_scalar_cycles.
 727
 728    Examine the cross iteration def-use cycles of scalar variables, by
 729    analyzing the loop-header PHIs of scalar variables.  Classify each
 730    cycle as one of the following: invariant, induction, reduction, unknown.
 731    We do that for the loop represented by LOOP_VINFO, and also to its
 732    inner-loop, if exists.
 733    Examples for scalar cycles:
 734
 735    Example1: reduction:
 736
 737               loop1:
 738               for (i=0; i<N; i++)
 739                  sum += a[i];
 740
 741    Example2: induction:
 742
 743               loop2:
 744               for (i=0; i<N; i++)
 745                  a[i] = i;  */
 746
 747 static void
 748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 749 {
 750   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 751
 752   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 753
 754   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 755      Reductions in such inner-loop therefore have different properties than
 756      the reductions in the nest that gets vectorized:
 757      1. When vectorized, they are executed in the same order as in the original
 758         scalar loop, so we can't change the order of computation when
 759         vectorizing them.
 760      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 761         current checks are too strict.  */
 762
 763   if (loop->inner)
 764     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 765 }
 766
 767 /* Transfer group and reduction information from STMT_INFO to its
 768    pattern stmt.  */
 769
 770 static void
 771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 772 {
 773   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 774   stmt_vec_info stmtp;
 775   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 776               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 777   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 778   do
 779     {
 780       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 781       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 782                            == STMT_VINFO_DEF_TYPE (stmt_info));
 783       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 784       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 785       if (stmt_info)
 786         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 787           = STMT_VINFO_RELATED_STMT (stmt_info);
 788     }
 789   while (stmt_info);
 790 }
 791
 792 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 793
 794 static void
 795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 796 {
 797   stmt_vec_info first;
 798   unsigned i;
 799
 800   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 801     {
 802       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 803       while (next)
 804         {
 805           if ((STMT_VINFO_IN_PATTERN_P (next)
 806                != STMT_VINFO_IN_PATTERN_P (first))
 807               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 808             break;
 809           next = REDUC_GROUP_NEXT_ELEMENT (next);
 810         }
 811       /* If all reduction chain members are well-formed patterns adjust
 812          the group to group the pattern stmts instead.  */
 813       if (! next
 814           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 815         {
 816           if (STMT_VINFO_IN_PATTERN_P (first))
 817             {
 818               vect_fixup_reduc_chain (first);
 819               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 820                 = STMT_VINFO_RELATED_STMT (first);
 821             }
 822         }
 823       /* If not all stmt in the chain are patterns or if we failed
 824          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 825          it as regular reduction instead.  */
 826       else
 827         {
 828           stmt_vec_info vinfo = first;
 829           stmt_vec_info last = NULL;
 830           while (vinfo)
 831             {
 832               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 833               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 834               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 835               last = vinfo;
 836               vinfo = next;
 837             }
 838           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 839             = vect_internal_def;
 840           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 841           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 842           --i;
 843         }
 844     }
 845 }
 846
 847 /* Function vect_get_loop_niters.
 848
 849    Determine how many iterations the loop is executed and place it
 850    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 851    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 852    niter information holds in ASSUMPTIONS.
 853
 854    Return the loop exit conditions.  */
 855
 856
 857 static vec<gcond *>
 858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
 859                       tree *number_of_iterations, tree *number_of_iterationsm1)
 860 {
 861   auto_vec<edge> exits = get_loop_exit_edges (loop);
 862   vec<gcond *> conds;
 863   conds.create (exits.length ());
 864   class tree_niter_desc niter_desc;
 865   tree niter_assumptions, niter, may_be_zero;
 866
 867   *assumptions = boolean_true_node;
 868   *number_of_iterationsm1 = chrec_dont_know;
 869   *number_of_iterations = chrec_dont_know;
 870
 871   DUMP_VECT_SCOPE ("get_loop_niters");
 872
 873   if (exits.is_empty ())
 874     return conds;
 875
 876   if (dump_enabled_p ())
 877     dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
 878                      exits.length ());
 879
 880   edge exit;
 881   unsigned int i;
 882   FOR_EACH_VEC_ELT (exits, i, exit)
 883     {
 884       gcond *cond = get_loop_exit_condition (exit);
 885       if (cond)
 886         conds.safe_push (cond);
 887
 888       if (dump_enabled_p ())
 889         dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
 890
 891       if (exit != main_exit)
 892         continue;
 893
 894       may_be_zero = NULL_TREE;
 895       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 896           || chrec_contains_undetermined (niter_desc.niter))
 897         continue;
 898
 899       niter_assumptions = niter_desc.assumptions;
 900       may_be_zero = niter_desc.may_be_zero;
 901       niter = niter_desc.niter;
 902
 903       if (may_be_zero && integer_zerop (may_be_zero))
 904         may_be_zero = NULL_TREE;
 905
 906       if (may_be_zero)
 907         {
 908           if (COMPARISON_CLASS_P (may_be_zero))
 909             {
 910               /* Try to combine may_be_zero with assumptions, this can simplify
 911                  computation of niter expression.  */
 912               if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 913                 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 914                                                  niter_assumptions,
 915                                                  fold_build1 (TRUTH_NOT_EXPR,
 916                                                               boolean_type_node,
 917                                                               may_be_zero));
 918               else
 919                 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 920                                      build_int_cst (TREE_TYPE (niter), 0),
 921                                      rewrite_to_non_trapping_overflow (niter));
 922
 923               may_be_zero = NULL_TREE;
 924             }
 925           else if (integer_nonzerop (may_be_zero))
 926             {
 927               *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 928               *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 929               continue;
 930             }
 931           else
 932             continue;
 933        }
 934
 935       /* Loop assumptions are based off the normal exit.  */
 936       *assumptions = niter_assumptions;
 937       *number_of_iterationsm1 = niter;
 938
 939       /* We want the number of loop header executions which is the number
 940          of latch executions plus one.
 941          ???  For UINT_MAX latch executions this number overflows to zero
 942          for loops like do { n++; } while (n != 0);  */
 943       if (niter && !chrec_contains_undetermined (niter))
 944           niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
 945                                unshare_expr (niter),
 946                                build_int_cst (TREE_TYPE (niter), 1));
 947       *number_of_iterations = niter;
 948     }
 949
 950   if (dump_enabled_p ())
 951     dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
 952
 953   return conds;
 954 }
 955
 956 /*  Determine the main loop exit for the vectorizer.  */
 957
 958 edge
 959 vec_init_loop_exit_info (class loop *loop)
 960 {
 961   /* Before we begin we must first determine which exit is the main one and
 962      which are auxilary exits.  */
 963   auto_vec<edge> exits = get_loop_exit_edges (loop);
 964   if (exits.length () == 1)
 965     return exits[0];
 966
 967   /* If we have multiple exits we only support counting IV at the moment.  Analyze
 968      all exits and return one */
 969   class tree_niter_desc niter_desc;
 970   edge candidate = NULL;
 971   for (edge exit : exits)
 972     {
 973       if (!get_loop_exit_condition (exit))
 974         continue;
 975
 976       if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 977           && !chrec_contains_undetermined (niter_desc.niter))
 978         {
 979           if (!niter_desc.may_be_zero || !candidate)
 980             candidate = exit;
 981         }
 982     }
 983
 984   return candidate;
 985 }
 986
 987 /* Function bb_in_loop_p
 988
 989    Used as predicate for dfs order traversal of the loop bbs.  */
 990
 991 static bool
 992 bb_in_loop_p (const_basic_block bb, const void *data)
 993 {
 994   const class loop *const loop = (const class loop *)data;
 995   if (flow_bb_inside_loop_p (loop, bb))
 996     return true;
 997   return false;
 998 }
 999
1000
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1003
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005   : vec_info (vec_info::loop, shared),
1006     loop (loop_in),
1007     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008     num_itersm1 (NULL_TREE),
1009     num_iters (NULL_TREE),
1010     num_iters_unchanged (NULL_TREE),
1011     num_iters_assumptions (NULL_TREE),
1012     vector_costs (nullptr),
1013     scalar_costs (nullptr),
1014     th (0),
1015     versioning_threshold (0),
1016     vectorization_factor (0),
1017     main_loop_edge (nullptr),
1018     skip_main_loop_edge (nullptr),
1019     skip_this_loop_edge (nullptr),
1020     reusable_accumulators (),
1021     suggested_unroll_factor (1),
1022     max_vectorization_factor (0),
1023     mask_skip_niters (NULL_TREE),
1024     rgroup_compare_type (NULL_TREE),
1025     simd_if_cond (NULL_TREE),
1026     partial_vector_style (vect_partial_vectors_none),
1027     unaligned_dr (NULL),
1028     peeling_for_alignment (0),
1029     ptr_mask (0),
1030     ivexpr_map (NULL),
1031     scan_map (NULL),
1032     slp_unrolling_factor (1),
1033     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034     vectorizable (false),
1035     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036     using_partial_vectors_p (false),
1037     using_decrementing_iv_p (false),
1038     using_select_vl_p (false),
1039     epil_using_partial_vectors_p (false),
1040     partial_load_store_bias (0),
1041     peeling_for_gaps (false),
1042     peeling_for_niter (false),
1043     no_data_dependencies (false),
1044     has_mask_store (false),
1045     scalar_loop_scaling (profile_probability::uninitialized ()),
1046     scalar_loop (NULL),
1047     orig_loop_info (NULL),
1048     vec_loop_iv_exit (NULL),
1049     vec_epilogue_loop_iv_exit (NULL),
1050     scalar_loop_iv_exit (NULL)
1051 {
1052   /* CHECKME: We want to visit all BBs before their successors (except for
1053      latch blocks, for which this assertion wouldn't hold).  In the simple
1054      case of the loop forms we allow, a dfs order of the BBs would the same
1055      as reversed postorder traversal, so we are safe.  */
1056
1057   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058                                           bbs, loop->num_nodes, loop);
1059   gcc_assert (nbbs == loop->num_nodes);
1060
1061   for (unsigned int i = 0; i < nbbs; i++)
1062     {
1063       basic_block bb = bbs[i];
1064       gimple_stmt_iterator si;
1065
1066       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1067         {
1068           gimple *phi = gsi_stmt (si);
1069           gimple_set_uid (phi, 0);
1070           add_stmt (phi);
1071         }
1072
1073       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1074         {
1075           gimple *stmt = gsi_stmt (si);
1076           gimple_set_uid (stmt, 0);
1077           if (is_gimple_debug (stmt))
1078             continue;
1079           add_stmt (stmt);
1080           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081              third argument is the #pragma omp simd if (x) condition, when 0,
1082              loop shouldn't be vectorized, when non-zero constant, it should
1083              be vectorized normally, otherwise versioned with vectorized loop
1084              done if the condition is non-zero at runtime.  */
1085           if (loop_in->simduid
1086               && is_gimple_call (stmt)
1087               && gimple_call_internal_p (stmt)
1088               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089               && gimple_call_num_args (stmt) >= 3
1090               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091               && (loop_in->simduid
1092                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1093             {
1094               tree arg = gimple_call_arg (stmt, 2);
1095               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096                 simd_if_cond = arg;
1097               else
1098                 gcc_assert (integer_nonzerop (arg));
1099             }
1100         }
1101     }
1102
1103   epilogue_vinfos.create (6);
1104 }
1105
1106 /* Free all levels of rgroup CONTROLS.  */
1107
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1110 {
1111   rgroup_controls *rgc;
1112   unsigned int i;
1113   FOR_EACH_VEC_ELT (*controls, i, rgc)
1114     rgc->controls.release ();
1115   controls->release ();
1116 }
1117
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119    stmt_vec_info structs of all the stmts in the loop.  */
1120
1121 _loop_vec_info::~_loop_vec_info ()
1122 {
1123   free (bbs);
1124
1125   release_vec_loop_controls (&masks.rgc_vec);
1126   release_vec_loop_controls (&lens);
1127   delete ivexpr_map;
1128   delete scan_map;
1129   epilogue_vinfos.release ();
1130   delete scalar_costs;
1131   delete vector_costs;
1132
1133   /* When we release an epiloge vinfo that we do not intend to use
1134      avoid clearing AUX of the main loop which should continue to
1135      point to the main loop vinfo since otherwise we'll leak that.  */
1136   if (loop->aux == this)
1137     loop->aux = NULL;
1138 }
1139
1140 /* Return an invariant or register for EXPR and emit necessary
1141    computations in the LOOP_VINFO loop preheader.  */
1142
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1145 {
1146   if (is_gimple_reg (expr)
1147       || is_gimple_min_invariant (expr))
1148     return expr;
1149
1150   if (! loop_vinfo->ivexpr_map)
1151     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153   if (! cached)
1154     {
1155       gimple_seq stmts = NULL;
1156       cached = force_gimple_operand (unshare_expr (expr),
1157                                      &stmts, true, NULL_TREE);
1158       if (stmts)
1159         {
1160           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161           gsi_insert_seq_on_edge_immediate (e, stmts);
1162         }
1163     }
1164   return cached;
1165 }
1166
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168    all masks required to mask LOOP_VINFO.  */
1169
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1172 {
1173   rgroup_controls *rgm;
1174   unsigned int i;
1175   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176     if (rgm->type != NULL_TREE
1177         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178                                             cmp_type, rgm->type,
1179                                             OPTIMIZE_FOR_SPEED))
1180       return false;
1181   return true;
1182 }
1183
1184 /* Calculate the maximum number of scalars per iteration for every
1185    rgroup in LOOP_VINFO.  */
1186
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1189 {
1190   unsigned int res = 1;
1191   unsigned int i;
1192   rgroup_controls *rgm;
1193   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194     res = MAX (res, rgm->max_nscalars_per_iter);
1195   return res;
1196 }
1197
1198 /* Calculate the minimum precision necessary to represent:
1199
1200       MAX_NITERS * FACTOR
1201
1202    as an unsigned integer, where MAX_NITERS is the maximum number of
1203    loop header iterations for the original scalar form of LOOP_VINFO.  */
1204
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1207 {
1208   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1209
1210   /* Get the maximum number of iterations that is representable
1211      in the counter type.  */
1212   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1214
1215   /* Get a more refined estimate for the number of iterations.  */
1216   widest_int max_back_edges;
1217   if (max_loop_iterations (loop, &max_back_edges))
1218     max_ni = wi::smin (max_ni, max_back_edges + 1);
1219
1220   /* Work out how many bits we need to represent the limit.  */
1221   return wi::min_precision (max_ni * factor, UNSIGNED);
1222 }
1223
1224 /* True if the loop needs peeling or partial vectors when vectorized.  */
1225
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1228 {
1229   unsigned HOST_WIDE_INT const_vf;
1230   HOST_WIDE_INT max_niter
1231     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1232
1233   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236                                           (loop_vinfo));
1237
1238   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1240     {
1241       /* Work out the (constant) number of iterations that need to be
1242          peeled for reasons other than niters.  */
1243       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245         peel_niter += 1;
1246       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248         return true;
1249     }
1250   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251       /* ??? When peeling for gaps but not alignment, we could
1252          try to check whether the (variable) niters is known to be
1253          VF * N + 1.  That's something of a niche case though.  */
1254       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257            < (unsigned) exact_log2 (const_vf))
1258           /* In case of versioning, check if the maximum number of
1259              iterations is greater than th.  If they are identical,
1260              the epilogue is unnecessary.  */
1261           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262               || ((unsigned HOST_WIDE_INT) max_niter
1263                   > (th / const_vf) * const_vf))))
1264     return true;
1265
1266   return false;
1267 }
1268
1269 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1270    whether we can actually generate the masks required.  Return true if so,
1271    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1272
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1275 {
1276   unsigned int min_ni_width;
1277
1278   /* Use a normal loop if there are no statements that need masking.
1279      This only happens in rare degenerate cases: it means that the loop
1280      has no loads, no stores, and no live-out values.  */
1281   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282     return false;
1283
1284   /* Produce the rgroup controls.  */
1285   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1286     {
1287       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288       tree vectype = mask.first;
1289       unsigned nvectors = mask.second;
1290
1291       if (masks->rgc_vec.length () < nvectors)
1292         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294       /* The number of scalars per iteration and the number of vectors are
1295          both compile-time constants.  */
1296       unsigned int nscalars_per_iter
1297           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1299
1300       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1301         {
1302           rgm->max_nscalars_per_iter = nscalars_per_iter;
1303           rgm->type = truth_type_for (vectype);
1304           rgm->factor = 1;
1305         }
1306     }
1307
1308   unsigned int max_nscalars_per_iter
1309     = vect_get_max_nscalars_per_iter (loop_vinfo);
1310
1311   /* Work out how many bits we need to represent the limit.  */
1312   min_ni_width
1313     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1314
1315   /* Find a scalar mode for which WHILE_ULT is supported.  */
1316   opt_scalar_int_mode cmp_mode_iter;
1317   tree cmp_type = NULL_TREE;
1318   tree iv_type = NULL_TREE;
1319   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320   unsigned int iv_precision = UINT_MAX;
1321
1322   if (iv_limit != -1)
1323     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324                                       UNSIGNED);
1325
1326   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1327     {
1328       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329       if (cmp_bits >= min_ni_width
1330           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1331         {
1332           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333           if (this_type
1334               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1335             {
1336               /* Although we could stop as soon as we find a valid mode,
1337                  there are at least two reasons why that's not always the
1338                  best choice:
1339
1340                  - An IV that's Pmode or wider is more likely to be reusable
1341                    in address calculations than an IV that's narrower than
1342                    Pmode.
1343
1344                  - Doing the comparison in IV_PRECISION or wider allows
1345                    a natural 0-based IV, whereas using a narrower comparison
1346                    type requires mitigations against wrap-around.
1347
1348                  Conversely, if the IV limit is variable, doing the comparison
1349                  in a wider type than the original type can introduce
1350                  unnecessary extensions, so picking the widest valid mode
1351                  is not always a good choice either.
1352
1353                  Here we prefer the first IV type that's Pmode or wider,
1354                  and the first comparison type that's IV_PRECISION or wider.
1355                  (The comparison type must be no wider than the IV type,
1356                  to avoid extensions in the vector loop.)
1357
1358                  ??? We might want to try continuing beyond Pmode for ILP32
1359                  targets if CMP_BITS < IV_PRECISION.  */
1360               iv_type = this_type;
1361               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362                 cmp_type = this_type;
1363               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364                 break;
1365             }
1366         }
1367     }
1368
1369   if (!cmp_type)
1370     {
1371       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372       return false;
1373     }
1374
1375   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378   return true;
1379 }
1380
1381 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1382    whether we can actually generate AVX512 style masks.  Return true if so,
1383    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1384
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1387 {
1388   /* Produce differently organized rgc_vec and differently check
1389      we can produce masks.  */
1390
1391   /* Use a normal loop if there are no statements that need masking.
1392      This only happens in rare degenerate cases: it means that the loop
1393      has no loads, no stores, and no live-out values.  */
1394   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395     return false;
1396
1397   /* For the decrementing IV we need to represent all values in
1398      [0, niter + niter_skip] where niter_skip is the elements we
1399      skip in the first iteration for prologue peeling.  */
1400   tree iv_type = NULL_TREE;
1401   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402   unsigned int iv_precision = UINT_MAX;
1403   if (iv_limit != -1)
1404     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1405
1406   /* First compute the type for the IV we use to track the remaining
1407      scalar iterations.  */
1408   opt_scalar_int_mode cmp_mode_iter;
1409   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1410     {
1411       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412       if (cmp_bits >= iv_precision
1413           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1414         {
1415           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416           if (iv_type)
1417             break;
1418         }
1419     }
1420   if (!iv_type)
1421     return false;
1422
1423   /* Produce the rgroup controls.  */
1424   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1425     {
1426       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427       tree vectype = mask.first;
1428       unsigned nvectors = mask.second;
1429
1430       /* The number of scalars per iteration and the number of vectors are
1431          both compile-time constants.  */
1432       unsigned int nscalars_per_iter
1433         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1435
1436       /* We index the rgroup_controls vector with nscalars_per_iter
1437          which we keep constant and instead have a varying nvectors,
1438          remembering the vector mask with the fewest nV.  */
1439       if (masks->rgc_vec.length () < nscalars_per_iter)
1440         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1442
1443       if (!rgm->type || rgm->factor > nvectors)
1444         {
1445           rgm->type = truth_type_for (vectype);
1446           rgm->compare_type = NULL_TREE;
1447           rgm->max_nscalars_per_iter = nscalars_per_iter;
1448           rgm->factor = nvectors;
1449           rgm->bias_adjusted_ctrl = NULL_TREE;
1450         }
1451     }
1452
1453   /* There is no fixed compare type we are going to use but we have to
1454      be able to get at one for each mask group.  */
1455   unsigned int min_ni_width
1456     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1457
1458   bool ok = true;
1459   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1460     {
1461       tree mask_type = rgc.type;
1462       if (!mask_type)
1463         continue;
1464
1465       if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1466         {
1467           ok = false;
1468           break;
1469         }
1470
1471       /* If iv_type is usable as compare type use that - we can elide the
1472          saturation in that case.   */
1473       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1474         {
1475           tree cmp_vectype
1476             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1477           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1478             rgc.compare_type = cmp_vectype;
1479         }
1480       if (!rgc.compare_type)
1481         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1482           {
1483             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1484             if (cmp_bits >= min_ni_width
1485                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1486               {
1487                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1488                 if (!cmp_type)
1489                   continue;
1490
1491                 /* Check whether we can produce the mask with cmp_type.  */
1492                 tree cmp_vectype
1493                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1494                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1495                   {
1496                     rgc.compare_type = cmp_vectype;
1497                     break;
1498                   }
1499               }
1500         }
1501       if (!rgc.compare_type)
1502         {
1503           ok = false;
1504           break;
1505         }
1506     }
1507   if (!ok)
1508     {
1509       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1510       return false;
1511     }
1512
1513   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1514   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1515   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1516   return true;
1517 }
1518
1519 /* Check whether we can use vector access with length based on precison
1520    comparison.  So far, to keep it simple, we only allow the case that the
1521    precision of the target supported length is larger than the precision
1522    required by loop niters.  */
1523
1524 static bool
1525 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1526 {
1527   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1528     return false;
1529
1530   machine_mode len_load_mode, len_store_mode;
1531   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1532          .exists (&len_load_mode))
1533     return false;
1534   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1535          .exists (&len_store_mode))
1536     return false;
1537
1538   signed char partial_load_bias = internal_len_load_store_bias
1539     (IFN_LEN_LOAD, len_load_mode);
1540
1541   signed char partial_store_bias = internal_len_load_store_bias
1542     (IFN_LEN_STORE, len_store_mode);
1543
1544   gcc_assert (partial_load_bias == partial_store_bias);
1545
1546   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1547     return false;
1548
1549   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1550      len_loads with a length of zero.  In order to avoid that we prohibit
1551      more than one loop length here.  */
1552   if (partial_load_bias == -1
1553       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1554     return false;
1555
1556   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1557
1558   unsigned int max_nitems_per_iter = 1;
1559   unsigned int i;
1560   rgroup_controls *rgl;
1561   /* Find the maximum number of items per iteration for every rgroup.  */
1562   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1563     {
1564       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1565       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1566     }
1567
1568   /* Work out how many bits we need to represent the length limit.  */
1569   unsigned int min_ni_prec
1570     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1571
1572   /* Now use the maximum of below precisions for one suitable IV type:
1573      - the IV's natural precision
1574      - the precision needed to hold: the maximum number of scalar
1575        iterations multiplied by the scale factor (min_ni_prec above)
1576      - the Pmode precision
1577
1578      If min_ni_prec is less than the precision of the current niters,
1579      we perfer to still use the niters type.  Prefer to use Pmode and
1580      wider IV to avoid narrow conversions.  */
1581
1582   unsigned int ni_prec
1583     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1584   min_ni_prec = MAX (min_ni_prec, ni_prec);
1585   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1586
1587   tree iv_type = NULL_TREE;
1588   opt_scalar_int_mode tmode_iter;
1589   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1590     {
1591       scalar_mode tmode = tmode_iter.require ();
1592       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1593
1594       /* ??? Do we really want to construct one IV whose precision exceeds
1595          BITS_PER_WORD?  */
1596       if (tbits > BITS_PER_WORD)
1597         break;
1598
1599       /* Find the first available standard integral type.  */
1600       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1601         {
1602           iv_type = build_nonstandard_integer_type (tbits, true);
1603           break;
1604         }
1605     }
1606
1607   if (!iv_type)
1608     {
1609       if (dump_enabled_p ())
1610         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611                          "can't vectorize with length-based partial vectors"
1612                          " because there is no suitable iv type.\n");
1613       return false;
1614     }
1615
1616   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1617   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1618   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1619
1620   return true;
1621 }
1622
1623 /* Compute the cost of a single scalar iteration of the loop described by
1624    LOOP_VINFO: collect per-statement entries in
1625    LOOP_VINFO_SCALAR_ITERATION_COST and accumulate them into scalar_costs.  */
1626 static void
1627 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1628 {
1629   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1630   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1631   const int nbbs = loop->num_nodes;
1632
1633   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1634
1635   /* FORNOW: statements of an inner loop are weighted by the inner-loop
1636      cost factor, everything else counts exactly once.  */
1637   int inner_weight = 1;
1638   if (loop->inner)
1639     inner_weight = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1640
1641   /* Collect the per-statement costs of the scalar loop body.  */
1642   for (int bb_idx = 0; bb_idx < nbbs; bb_idx++)
1643     {
1644       basic_block bb = bbs[bb_idx];
1645       const int weight = bb->loop_father == loop->inner ? inner_weight : 1;
1646
1647       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1648            gsi_next (&gsi))
1649         {
1650           gimple *stmt = gsi_stmt (gsi);
1651           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1652
1653           /* Only assignments and calls contribute to the cost.  */
1654           if (!(is_gimple_assign (stmt) || is_gimple_call (stmt)))
1655             continue;
1656
1657           /* Keep only statements that are vectorized inside the loop:
1658              relevant ones, or live ones that are part of a vectorizable
1659              cycle.  */
1660           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1661           if (!(STMT_VINFO_RELEVANT_P (vstmt_info)
1662                 || (STMT_VINFO_LIVE_P (vstmt_info)
1663                     && VECTORIZABLE_CYCLE_DEF
1664                          (STMT_VINFO_DEF_TYPE (vstmt_info)))))
1665             continue;
1666
1667           vect_cost_for_stmt kind;
1668           if (!STMT_VINFO_DATA_REF (stmt_info))
1669             {
1670               /* No-op conversions are not costed at all.  */
1671               if (vect_nop_conversion_p (stmt_info))
1672                 continue;
1673               kind = scalar_stmt;
1674             }
1675           else if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1676             kind = scalar_load;
1677           else
1678             kind = scalar_store;
1679
1680           /* Recording the entry as vect_prologue avoids scaling by the
1681              inner-loop factor a second time.  */
1682           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1683                             weight, kind, stmt_info, 0, vect_prologue);
1684         }
1685     }
1686
1687   /* Feed the collected entries into a freshly created cost model.  */
1688   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1689   add_stmt_costs (loop_vinfo->scalar_costs,
1690                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1691   loop_vinfo->scalar_costs->finish_cost (nullptr);
1692 }
1693
1694
1695 /* Function vect_analyze_loop_form.
1696
1697    Verify that certain CFG restrictions hold, including:
1698    - the loop has a pre-header
1699    - the loop has a single entry and exit
1700    - the loop exit condition is simple enough
1701    - the number of iterations can be analyzed, i.e, a countable loop.  The
1702      niter could be analyzed under some assumptions.

        LOOP is the loop to check.  On success INFO holds the main exit edge
        (loop_exit), the exit conditions with the main exit's condition in
        slot 0 (conds), the niter assumptions, the iteration counts
        (number_of_iterations, number_of_iterationsm1) and, for an outer
        loop, the inner loop's condition (inner_loop_cond, NULL otherwise).
        Return opt_result::success (), or a failure carrying the reason for
        the first restriction found to be violated.  */
1703
1704 opt_result
1705 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1706 {
1707   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1708
       /* Select the exit edge that is treated as the main loop exit.  */
1709   edge exit_e = vec_init_loop_exit_info (loop);
1710   if (!exit_e)
1711     return opt_result::failure_at (vect_location,
1712                                    "not vectorized:"
1713                                    " could not determine main exit from"
1714                                    " loop with multiple exits.\n");
1715   info->loop_exit = exit_e;
1716   if (dump_enabled_p ())
1717       dump_printf_loc (MSG_NOTE, vect_location,
1718                        "using as main loop exit: %d -> %d [AUX: %p]\n",
1719                        exit_e->src->index, exit_e->dest->index, exit_e->aux);
1720
1721   /* Different restrictions apply when we are considering an inner-most loop,
1722      vs. an outer (nested) loop.
1723      (FORNOW. May want to relax some of these restrictions in the future).  */
1724
1725   info->inner_loop_cond = NULL;
1726   if (!loop->inner)
1727     {
1728       /* Inner-most loop.  We currently require that the number of BBs is
1729          exactly 2 (the header and latch).  Vectorizable inner-most loops
1730          look like this:
1731
1732                         (pre-header)
1733                            |
1734                           header <--------+
1735                            | |            |
1736                            | +--> latch --+
1737                            |
1738                         (exit-bb)  */
1739
1740       if (loop->num_nodes != 2)
1741         return opt_result::failure_at (vect_location,
1742                                        "not vectorized:"
1743                                        " control flow in loop.\n");
1744
1745       if (empty_block_p (loop->header))
1746         return opt_result::failure_at (vect_location,
1747                                        "not vectorized: empty loop.\n");
1748     }
1749   else
1750     {
1751       class loop *innerloop = loop->inner;
1752       edge entryedge;
1753
1754       /* Nested loop. We currently require that the loop is doubly-nested,
1755          contains a single inner loop, and the number of BBs is exactly 5.
1756          Vectorizable outer-loops look like this:
1757
1758                         (pre-header)
1759                            |
1760                           header <---+
1761                            |         |
1762                           inner-loop |
1763                            |         |
1764                           tail ------+
1765                            |
1766                         (exit-bb)
1767
1768          The inner-loop has the properties expected of inner-most loops
1769          as described above.  */
1770
1771       if ((loop->inner)->inner || (loop->inner)->next)
1772         return opt_result::failure_at (vect_location,
1773                                        "not vectorized:"
1774                                        " multiple nested loops.\n");
1775
1776       if (loop->num_nodes != 5)
1777         return opt_result::failure_at (vect_location,
1778                                        "not vectorized:"
1779                                        " control flow in loop.\n");
1780
           /* The inner loop's preheader edge must start at the outer header,
              the inner loop must have a single exit, and that exit must lead
              to the source block of the outer latch's first incoming edge.  */
1781       entryedge = loop_preheader_edge (innerloop);
1782       if (entryedge->src != loop->header
1783           || !single_exit (innerloop)
1784           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1785         return opt_result::failure_at (vect_location,
1786                                        "not vectorized:"
1787                                        " unsupported outerloop form.\n");
1788
1789       /* Analyze the inner-loop.  */
1790       vect_loop_form_info inner;
1791       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1792       if (!res)
1793         {
1794           if (dump_enabled_p ())
1795             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1796                              "not vectorized: Bad inner loop.\n");
1797           return res;
1798         }
1799
1800       /* Don't support analyzing niter under assumptions for inner
1801          loop.  */
1802       if (!integer_onep (inner.assumptions))
1803         return opt_result::failure_at (vect_location,
1804                                        "not vectorized: Bad inner loop.\n");
1805
1806       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1807         return opt_result::failure_at (vect_location,
1808                                        "not vectorized: inner-loop count not"
1809                                        " invariant.\n");
1810
1811       if (dump_enabled_p ())
1812         dump_printf_loc (MSG_NOTE, vect_location,
1813                          "Considering outer-loop vectorization.\n");
           /* Slot 0 of the inner analysis holds the condition of the inner
              loop's main exit.  */
1814       info->inner_loop_cond = inner.conds[0];
1815     }
1816
1817   if (!single_exit (loop))
1818     return opt_result::failure_at (vect_location,
1819                                    "not vectorized: multiple exits.\n");
1820   if (EDGE_COUNT (loop->header->preds) != 2)
1821     return opt_result::failure_at (vect_location,
1822                                    "not vectorized:"
1823                                    " too many incoming edges.\n");
1824
1825   /* We assume that the loop exit condition is at the end of the loop. i.e,
1826      that the loop is represented as a do-while (with a proper if-guard
1827      before the loop if needed), where the loop header contains all the
1828      executable statements, and the latch is empty.  */
1829   if (!empty_block_p (loop->latch)
1830       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1831     return opt_result::failure_at (vect_location,
1832                                    "not vectorized: latch block not empty.\n");
1833
1834   /* Make sure the exit is not abnormal.  */
1835   if (exit_e->flags & EDGE_ABNORMAL)
1836     return opt_result::failure_at (vect_location,
1837                                    "not vectorized:"
1838                                    " abnormal loop exit edge.\n");
1839
       /* Collect the exit conditions and compute the iteration counts and
          the assumptions they are valid under.  */
1840   info->conds
1841     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1842                             &info->number_of_iterations,
1843                             &info->number_of_iterationsm1);
1844
1845   if (info->conds.is_empty ())
1846     return opt_result::failure_at
1847       (vect_location,
1848        "not vectorized: complicated exit condition.\n");
1849
1850   /* Determine what the primary and alternate exit conds are.  The cond
          located in the source block of the main exit is moved to slot 0.  */
1851   for (unsigned i = 0; i < info->conds.length (); i++)
1852     {
1853       gcond *cond = info->conds[i];
1854       if (exit_e->src == gimple_bb (cond))
1855         std::swap (info->conds[0], info->conds[i]);
1856     }
1857
1858   if (integer_zerop (info->assumptions)
1859       || !info->number_of_iterations
1860       || chrec_contains_undetermined (info->number_of_iterations))
1861     return opt_result::failure_at
1862       (info->conds[0],
1863        "not vectorized: number of iterations cannot be computed.\n");
1864
1865   if (integer_zerop (info->number_of_iterations))
1866     return opt_result::failure_at
1867       (info->conds[0],
1868        "not vectorized: number of iterations = 0.\n");
1869
       /* A count that is not a positive constant fitting a HOST_WIDE_INT is
          only reported in the dump; it is not a reason to fail here.  */
1870   if (!(tree_fits_shwi_p (info->number_of_iterations)
1871         && tree_to_shwi (info->number_of_iterations) > 0))
1872     {
1873       if (dump_enabled_p ())
1874         {
1875           dump_printf_loc (MSG_NOTE, vect_location,
1876                            "Symbolic number of iterations is ");
1877           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1878           dump_printf (MSG_NOTE, "\n");
1879         }
1880     }
1881
1882   return opt_result::success ();
1883 }
1884
1885 /* Build and return a new loop_vec_info for LOOP from SHARED and from INFO,
1886    the vect_analyze_loop_form result; MAIN_LOOP_INFO is its original info.  */
1887
1888 loop_vec_info
1889 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1890                         const vect_loop_form_info *info,
1891                         loop_vec_info main_loop_info)
1892 {
1893   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1894   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1895   LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1896   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1897   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1898   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1899   /* Niter assumptions are kept for versioning, but only when they are not
1900      trivially true and no main loop info was supplied.  */
1901   if (!(integer_onep (info->assumptions) || main_loop_info))
1902     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1903
1904   /* Every exit condition is loop-exit control.  All but the first one are
1905      queued as extra loop conditions; the first one is the IV condition.  */
1906   for (unsigned idx = 0; idx < info->conds.length (); ++idx)
1907     {
1908       gcond *exit_cond = info->conds[idx];
1909       stmt_vec_info exit_cond_info = loop_vinfo->lookup_stmt (exit_cond);
1910       STMT_VINFO_TYPE (exit_cond_info) = loop_exit_ctrl_vec_info_type;
1911       if (idx > 0)
1912         LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (exit_cond);
1913     }
1914   LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1915
1916   if (!info->inner_loop_cond)
1917     return loop_vinfo;
1918
1919   /* The inner loop's exit condition is loop-exit control too.  An estimate
1920      of its iteration count, if any, caps the costing scale factor.  */
1921   stmt_vec_info inner_loop_cond_info
1922     = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1923   STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1924   widest_int nit;
1925   if (estimated_stmt_executions (loop->inner, &nit))
1926     LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1927       = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1928   return loop_vinfo;
1929 }
1930
1931
1932
1933 /* Update the vectorization factor of LOOP_VINFO after scanning the loop
1934    stmts for SLP and non-SLP statements.  */
1935
1936 static void
1937 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1938 {
1939   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1940   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1941   const int nbbs = loop->num_nodes;
1942
1943   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1944
1945   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1946   gcc_assert (known_ne (vf, 0U));
1947
1948   /* When every stmt in the loop can be SLPed only SLP is performed, and
1949      the vectorization factor of the loop is the unrolling factor required
1950      by the SLP instances.  An unrolling factor of 1 means pure SLP on the
1951      loop, i.e. cross iteration parallelism is not exploited.
1952
1953      A stmt that is relevant or a vectorizable cycle def but not pure SLP
1954      needs loop-based vectorization in addition to SLP.  */
1955   bool pure_slp_loop = true;
1956   for (int bb_idx = 0; bb_idx < nbbs; bb_idx++)
1957     {
1958       basic_block bb = bbs[bb_idx];
1959       /* PHIs first; those without a stmt_vec_info are ignored.  */
1960       for (gphi_iterator psi = gsi_start_phis (bb); !gsi_end_p (psi);
1961            gsi_next (&psi))
1962         {
1963           stmt_vec_info phi_info = loop_vinfo->lookup_stmt (psi.phi ());
1964           if (!phi_info || PURE_SLP_STMT (phi_info))
1965             continue;
1966           if (STMT_VINFO_RELEVANT_P (phi_info)
1967               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info)))
1968             pure_slp_loop = false;
1969         }
1970
1971       /* Then the other stmts, skipping debug stmts.  */
1972       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1973            gsi_next (&gsi))
1974         {
1975           gimple *stmt = gsi_stmt (gsi);
1976           if (is_gimple_debug (stmt))
1977             continue;
1978           stmt_vec_info stmt_info
1979             = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (stmt));
1980           if ((VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
1981                || STMT_VINFO_RELEVANT_P (stmt_info))
1982               && !PURE_SLP_STMT (stmt_info))
1983             pure_slp_loop = false;
1984         }
1985     }
1986
1987   if (!pure_slp_loop)
1988     {
1989       if (dump_enabled_p ())
1990         dump_printf_loc (MSG_NOTE, vect_location,
1991                          "Loop contains SLP and non-SLP stmts\n");
1992       /* The vectorization factor and the unroll factor both have the form
1993          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1994          hence a common multiple exists.  */
1995       vf = force_common_multiple (vf,
1996                                   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1997     }
1998   else
1999     {
2000       if (dump_enabled_p ())
2001         dump_printf_loc (MSG_NOTE, vect_location,
2002                          "Loop contains only SLP stmts\n");
2003       vf = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2004     }
2005
2006   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vf;
2007   if (!dump_enabled_p ())
2008     return;
2009
2010   dump_printf_loc (MSG_NOTE, vect_location,
2011                    "Updating vectorization factor to ");
2012   dump_dec (MSG_NOTE, vf);
2013   dump_printf (MSG_NOTE, ".\n");
2014 }
2015
2016 /* Return true if STMT_INFO is a double reduction phi whose partner phi
2017    (its STMT_VINFO_REDUC_DEF) is relevant for vectorization as well.
2018    A case that is rejected looks like:
2019
2020       outer1:
2021         x_1 = PHI <x_3(outer2), ...>;
2022         ...
2023
2024       inner:
2025         x_2 = ...;
2026         ...
2027
2028       outer2:
2029         x_3 = PHI <x_2(inner)>;
2030
2031    where neither x_2 nor anything else makes x_1 relevant.  */
2032
2033 static bool
2034 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2035 {
2036   const bool is_double_reduction
2037     = STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def;
2038   return (is_double_reduction
2039           && STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info)));
2040 }
2041
2042 /* Function vect_analyze_loop_operations.
2043
2044    Scan the loop stmts and make sure they are all vectorizable.

        For each basic block of LOOP_VINFO's loop the PHIs are checked
        first, then the remaining statements via vect_analyze_stmt.  Costs
        recorded during the checks are added to LOOP_VINFO->vector_costs.
        Return a failure if a statement is not supported or if no statement
        needs to be vectorized, and opt_result::success () otherwise.  */
2045
2046 static opt_result
2047 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2048 {
2049   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2050   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2051   int nbbs = loop->num_nodes;
2052   int i;
2053   stmt_vec_info stmt_info;
2054   bool need_to_vectorize = false;
2055   bool ok;
2056
2057   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2058
       /* Costs recorded while analyzing the PHIs and statements below.  */
2059   auto_vec<stmt_info_for_cost> cost_vec;
2060
2061   for (i = 0; i < nbbs; i++)
2062     {
2063       basic_block bb = bbs[i];
2064
2065       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2066            gsi_next (&si))
2067         {
2068           gphi *phi = si.phi ();
2069           ok = true;
2070
2071           stmt_info = loop_vinfo->lookup_stmt (phi);
2072           if (dump_enabled_p ())
2073             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2074                              (gimple *) phi);
               /* Virtual PHIs need no checking.  */
2075           if (virtual_operand_p (gimple_phi_result (phi)))
2076             continue;
2077
2078           /* Inner-loop loop-closed exit phi in outer-loop vectorization
2079              (i.e., a phi in the tail of the outer-loop).  */
2080           if (! is_loop_header_bb_p (bb))
2081             {
2082               /* FORNOW: we currently don't support the case that these phis
2083                  are not used in the outerloop (unless it is double reduction,
2084                  i.e., this phi is vect_reduction_def), cause this case
2085                  requires to actually do something here.  */
2086               if (STMT_VINFO_LIVE_P (stmt_info)
2087                   && !vect_active_double_reduction_p (stmt_info))
2088                 return opt_result::failure_at (phi,
2089                                                "Unsupported loop-closed phi"
2090                                                " in outer-loop.\n");
2091
2092               /* If PHI is used in the outer loop, we check that its operand
2093                  is defined in the inner loop.  */
2094               if (STMT_VINFO_RELEVANT_P (stmt_info))
2095                 {
2096                   tree phi_op;
2097
2098                   if (gimple_phi_num_args (phi) != 1)
2099                     return opt_result::failure_at (phi, "unsupported phi\n");
2100
2101                   phi_op = PHI_ARG_DEF (phi, 0);
2102                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2103                   if (!op_def_info)
2104                     return opt_result::failure_at (phi, "unsupported phi\n");
2105
2106                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2107                       && (STMT_VINFO_RELEVANT (op_def_info)
2108                           != vect_used_in_outer_by_reduction))
2109                     return opt_result::failure_at (phi, "unsupported phi\n");
2110
2111                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2112                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2113                            == vect_double_reduction_def))
2114                       && !vectorizable_lc_phi (loop_vinfo,
2115                                                stmt_info, NULL, NULL))
2116                     return opt_result::failure_at (phi, "unsupported phi\n");
2117                 }
2118
2119               continue;
2120             }
2121
               /* From here on PHI is located in a loop header block.  */
2122           gcc_assert (stmt_info);
2123
2124           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2125                || STMT_VINFO_LIVE_P (stmt_info))
2126               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2127               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2128             /* A scalar-dependence cycle that we don't support.  */
2129             return opt_result::failure_at (phi,
2130                                            "not vectorized:"
2131                                            " scalar dependence cycle.\n");
2132
               /* Relevant non-SLP header PHIs are checked by the routine that
                  matches their def type: induction, reduction (including double
                  reductions and nested cycles) or first order recurrence.  */
2133           if (STMT_VINFO_RELEVANT_P (stmt_info))
2134             {
2135               need_to_vectorize = true;
2136               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2137                   && ! PURE_SLP_STMT (stmt_info))
2138                 ok = vectorizable_induction (loop_vinfo,
2139                                              stmt_info, NULL, NULL,
2140                                              &cost_vec);
2141               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2142                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2143                             == vect_double_reduction_def)
2144                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2145                        && ! PURE_SLP_STMT (stmt_info))
2146                 ok = vectorizable_reduction (loop_vinfo,
2147                                              stmt_info, NULL, NULL, &cost_vec);
2148               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2149                         == vect_first_order_recurrence)
2150                        && ! PURE_SLP_STMT (stmt_info))
2151                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2152                                            &cost_vec);
2153             }
2154
2155           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2156           if (ok
2157               && STMT_VINFO_LIVE_P (stmt_info)
2158               && !PURE_SLP_STMT (stmt_info))
2159             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2160                                               -1, false, &cost_vec);
2161
2162           if (!ok)
2163             return opt_result::failure_at (phi,
2164                                            "not vectorized: relevant phi not "
2165                                            "supported: %G",
2166                                            static_cast <gimple *> (phi));
2167         }
2168
           /* Analyze the non-PHI statements; clobbers and debug statements
              are not analyzed.  */
2169       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2170            gsi_next (&si))
2171         {
2172           gimple *stmt = gsi_stmt (si);
2173           if (!gimple_clobber_p (stmt)
2174               && !is_gimple_debug (stmt))
2175             {
2176               opt_result res
2177                 = vect_analyze_stmt (loop_vinfo,
2178                                      loop_vinfo->lookup_stmt (stmt),
2179                                      &need_to_vectorize,
2180                                      NULL, NULL, &cost_vec);
2181               if (!res)
2182                 return res;
2183             }
2184         }
2185     } /* bbs */
2186
       /* Hand the costs gathered above to the vector cost model.  */
2187   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2188
2189   /* All operations in the loop are either irrelevant (deal with loop
2190      control, or dead), or only used outside the loop and can be moved
2191      out of the loop (e.g. invariants, inductions).  The loop can be
2192      optimized away by scalar optimizations.  We're better off not
2193      touching this loop.  */
2194   if (!need_to_vectorize)
2195     {
2196       if (dump_enabled_p ())
2197         dump_printf_loc (MSG_NOTE, vect_location,
2198                          "All the computation can be taken out of the loop.\n");
2199       return opt_result::failure_at
2200         (vect_location,
2201          "not vectorized: redundant loop. no profit to vectorize.\n");
2202     }
2203
2204   return opt_result::success ();
2205 }
2206
2207 /* Return true only if the iteration count of LOOP_VINFO's loop is known
2208    to be smaller than the vectorization factor assumed for costing.
2209    Return false if it is not, or if that cannot be determined.  */
2210
2211 static bool
2212 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2213 {
2214   const unsigned int vf_for_cost = vect_vf_for_cost (loop_vinfo);
2215
2216   /* Use the exact iteration count when it is known, otherwise the bound
2217      on statement executions; a value of -1 is treated as unknown.  */
2218   const HOST_WIDE_INT niters_bound
2219     = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2220        ? (HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2221        : max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)));
2222   if (niters_bound == -1)
2223     return false;
2224
2225   return (unsigned HOST_WIDE_INT) niters_bound < vf_for_cost;
2226 }
2227
2228 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2229    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2230    definitely no, or -1 if it's worth retrying.  */
2231
2232 static int
2233 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2234                            unsigned *suggested_unroll_factor)
2235 {
2236   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2237   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2238
2239   /* Only loops that can handle partially-populated vectors can have iteration
2240      counts less than the vectorization factor.  */
2241   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2242       && vect_known_niters_smaller_than_vf (loop_vinfo))
2243     {
2244       if (dump_enabled_p ())
2245         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246                          "not vectorized: iteration count smaller than "
2247                          "vectorization factor.\n");
2248       return 0;
2249     }
2250
2251   /* If we know the number of iterations we can do better, for the
2252      epilogue we can also decide whether the main loop leaves us
2253      with enough iterations, preferring a smaller vector epilog then
2254      also possibly used for the case we skip the vector loop.  */
2255   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2256     {
2257       widest_int scalar_niters
2258         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2259       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2260         {
2261           loop_vec_info orig_loop_vinfo
2262             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2263           unsigned lowest_vf
2264             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2265           int prolog_peeling = 0;
2266           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2267             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2268           if (prolog_peeling >= 0
2269               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2270                            lowest_vf))
2271             {
2272               unsigned gap
2273                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2274               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2275                                % lowest_vf + gap);
2276             }
2277         }
2278       /* Reject vectorizing for a single scalar iteration, even if
2279          we could in principle implement that using partial vectors.  */
2280       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2281       if (scalar_niters <= peeling_gap + 1)
2282         {
2283           if (dump_enabled_p ())
2284             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285                              "not vectorized: loop only has a single "
2286                              "scalar iteration.\n");
2287           return 0;
2288         }
2289
2290       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2291         {
2292           /* Check that the loop processes at least one full vector.  */
2293           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2294           if (known_lt (scalar_niters, vf))
2295             {
2296               if (dump_enabled_p ())
2297                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298                                  "loop does not have enough iterations "
2299                                  "to support vectorization.\n");
2300               return 0;
2301             }
2302
2303           /* If we need to peel an extra epilogue iteration to handle data
2304              accesses with gaps, check that there are enough scalar iterations
2305              available.
2306
2307              The check above is redundant with this one when peeling for gaps,
2308              but the distinction is useful for diagnostics.  */
2309           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2310               && known_le (scalar_niters, vf))
2311             {
2312               if (dump_enabled_p ())
2313                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314                                  "loop does not have enough iterations "
2315                                  "to support peeling for gaps.\n");
2316               return 0;
2317             }
2318         }
2319     }
2320
2321   /* If using the "very cheap" model, reject cases in which we'd keep
2322      a copy of the scalar code (even if we might be able to vectorize it).  */
2323   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2324       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2325           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2326           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2327     {
2328       if (dump_enabled_p ())
2329         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330                          "some scalar iterations would need to be peeled\n");
2331       return 0;
2332     }
2333
2334   int min_profitable_iters, min_profitable_estimate;
2335   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2336                                       &min_profitable_estimate,
2337                                       suggested_unroll_factor);
2338
2339   if (min_profitable_iters < 0)
2340     {
2341       if (dump_enabled_p ())
2342         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343                          "not vectorized: vectorization not profitable.\n");
2344       if (dump_enabled_p ())
2345         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346                          "not vectorized: vector version will never be "
2347                          "profitable.\n");
2348       return -1;
2349     }
2350
2351   int min_scalar_loop_bound = (param_min_vect_loop_bound
2352                                * assumed_vf);
2353
2354   /* Use the cost model only if it is more conservative than user specified
2355      threshold.  */
2356   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2357                                     min_profitable_iters);
2358
2359   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2360
2361   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2362       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2363     {
2364       if (dump_enabled_p ())
2365         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2366                          "not vectorized: vectorization not profitable.\n");
2367       if (dump_enabled_p ())
2368         dump_printf_loc (MSG_NOTE, vect_location,
2369                          "not vectorized: iteration count smaller than user "
2370                          "specified loop bound parameter or minimum profitable "
2371                          "iterations (whichever is more conservative).\n");
2372       return 0;
2373     }
2374
2375   /* The static profitability threshold min_profitable_estimate includes
2376      the cost of having to check at runtime whether the scalar loop
2377      should be used instead.  If it turns out that we don't need or want
2378      such a check, the threshold we should use for the static estimate
2379      is simply the point at which the vector loop becomes more profitable
2380      than the scalar loop.  */
2381   if (min_profitable_estimate > min_profitable_iters
2382       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2383       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2384       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2385       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2386     {
2387       if (dump_enabled_p ())
2388         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2389                          " choice between the scalar and vector loops\n");
2390       min_profitable_estimate = min_profitable_iters;
2391     }
2392
2393   /* If the vector loop needs multiple iterations to be beneficial then
2394      things are probably too close to call, and the conservative thing
2395      would be to stick with the scalar code.  */
2396   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2397       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2398     {
2399       if (dump_enabled_p ())
2400         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401                          "one iteration of the vector loop would be"
2402                          " more expensive than the equivalent number of"
2403                          " iterations of the scalar loop\n");
2404       return 0;
2405     }
2406
2407   HOST_WIDE_INT estimated_niter;
2408
2409   /* If we are vectorizing an epilogue then we know the maximum number of
2410      scalar iterations it will cover is at least one lower than the
2411      vectorization factor of the main loop.  */
2412   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2413     estimated_niter
2414       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2415   else
2416     {
2417       estimated_niter = estimated_stmt_executions_int (loop);
2418       if (estimated_niter == -1)
2419         estimated_niter = likely_max_stmt_executions_int (loop);
2420     }
2421   if (estimated_niter != -1
2422       && ((unsigned HOST_WIDE_INT) estimated_niter
2423           < MAX (th, (unsigned) min_profitable_estimate)))
2424     {
2425       if (dump_enabled_p ())
2426         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427                          "not vectorized: estimated iteration count too "
2428                          "small.\n");
2429       if (dump_enabled_p ())
2430         dump_printf_loc (MSG_NOTE, vect_location,
2431                          "not vectorized: estimated iteration count smaller "
2432                          "than specified loop bound parameter or minimum "
2433                          "profitable iterations (whichever is more "
2434                          "conservative).\n");
2435       return -1;
2436     }
2437
2438   return 1;
2439 }
2440
2441 static opt_result
2442 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2443                            vec<data_reference_p> *datarefs,
2444                            unsigned int *n_stmts)
2445 {
2446   *n_stmts = 0;
2447   for (unsigned i = 0; i < loop->num_nodes; i++)
2448     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2449          !gsi_end_p (gsi); gsi_next (&gsi))
2450       {
2451         gimple *stmt = gsi_stmt (gsi);
2452         if (is_gimple_debug (stmt))
2453           continue;
2454         ++(*n_stmts);
2455         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2456                                                         NULL, 0);
2457         if (!res)
2458           {
2459             if (is_gimple_call (stmt) && loop->safelen)
2460               {
2461                 tree fndecl = gimple_call_fndecl (stmt), op;
2462                 if (fndecl == NULL_TREE
2463                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2464                   {
2465                     fndecl = gimple_call_arg (stmt, 0);
2466                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2467                     fndecl = TREE_OPERAND (fndecl, 0);
2468                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2469                   }
2470                 if (fndecl != NULL_TREE)
2471                   {
2472                     cgraph_node *node = cgraph_node::get (fndecl);
2473                     if (node != NULL && node->simd_clones != NULL)
2474                       {
2475                         unsigned int j, n = gimple_call_num_args (stmt);
2476                         for (j = 0; j < n; j++)
2477                           {
2478                             op = gimple_call_arg (stmt, j);
2479                             if (DECL_P (op)
2480                                 || (REFERENCE_CLASS_P (op)
2481                                     && get_base_address (op)))
2482                               break;
2483                           }
2484                         op = gimple_call_lhs (stmt);
2485                         /* Ignore #pragma omp declare simd functions
2486                            if they don't have data references in the
2487                            call stmt itself.  */
2488                         if (j == n
2489                             && !(op
2490                                  && (DECL_P (op)
2491                                      || (REFERENCE_CLASS_P (op)
2492                                          && get_base_address (op)))))
2493                           continue;
2494                       }
2495                   }
2496               }
2497             return res;
2498           }
2499         /* If dependence analysis will give up due to the limit on the
2500            number of datarefs stop here and fail fatally.  */
2501         if (datarefs->length ()
2502             > (unsigned)param_loop_max_datarefs_for_datadeps)
2503           return opt_result::failure_at (stmt, "exceeded param "
2504                                          "loop-max-datarefs-for-datadeps\n");
2505       }
2506   return opt_result::success ();
2507 }
2508
2509 /* Look for SLP-only access groups and turn each individual access into its own
2510    group.  */
2511 static void
2512 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2513 {
2514   unsigned int i;
2515   struct data_reference *dr;
2516
2517   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2518
2519   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2520   FOR_EACH_VEC_ELT (datarefs, i, dr)
2521     {
2522       gcc_assert (DR_REF (dr));
2523       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2524
2525       /* Check if the load is a part of an interleaving chain.  */
2526       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2527         {
2528           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2529           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2530           unsigned int group_size = DR_GROUP_SIZE (first_element);
2531
2532           /* Check if SLP-only groups.  */
2533           if (!STMT_SLP_TYPE (stmt_info)
2534               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2535             {
2536               /* Dissolve the group.  */
2537               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2538
2539               stmt_vec_info vinfo = first_element;
2540               while (vinfo)
2541                 {
2542                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2543                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2544                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2545                   DR_GROUP_SIZE (vinfo) = 1;
2546                   if (STMT_VINFO_STRIDED_P (first_element)
2547                       /* We cannot handle stores with gaps.  */
2548                       || DR_IS_WRITE (dr_info->dr))
2549                     {
2550                       STMT_VINFO_STRIDED_P (vinfo) = true;
2551                       DR_GROUP_GAP (vinfo) = 0;
2552                     }
2553                   else
2554                     DR_GROUP_GAP (vinfo) = group_size - 1;
2555                   /* Duplicate and adjust alignment info, it needs to
2556                      be present on each group leader, see dr_misalignment.  */
2557                   if (vinfo != first_element)
2558                     {
2559                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2560                       dr_info2->target_alignment = dr_info->target_alignment;
2561                       int misalignment = dr_info->misalignment;
2562                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2563                         {
2564                           HOST_WIDE_INT diff
2565                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2566                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2567                           unsigned HOST_WIDE_INT align_c
2568                             = dr_info->target_alignment.to_constant ();
2569                           misalignment = (misalignment + diff) % align_c;
2570                         }
2571                       dr_info2->misalignment = misalignment;
2572                     }
2573                   vinfo = next;
2574                 }
2575             }
2576         }
2577     }
2578 }
2579
2580 /* Determine if operating on full vectors for LOOP_VINFO might leave
2581    some scalar iterations still to do.  If so, decide how we should
2582    handle those scalar iterations.  The possibilities are:
2583
2584    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2585        In this case:
2586
2587          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2588          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2589          LOOP_VINFO_PEELING_FOR_NITER == false
2590
2591    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2592        to handle the remaining scalar iterations.  In this case:
2593
2594          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2595          LOOP_VINFO_PEELING_FOR_NITER == true
2596
2597        There are two choices:
2598
2599        (2a) Consider vectorizing the epilogue loop at the same VF as the
2600             main loop, but using partial vectors instead of full vectors.
2601             In this case:
2602
2603               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2604
2605        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2606             In this case:
2607
2608               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2609  */
2610
2611 opt_result
2612 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2613 {
2614   /* Determine whether there would be any scalar iterations left over.  */
2615   bool need_peeling_or_partial_vectors_p
2616     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2617
2618   /* Decide whether to vectorize the loop with partial vectors.  */
2619   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2620   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2621   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2622       && need_peeling_or_partial_vectors_p)
2623     {
2624       /* For partial-vector-usage=1, try to push the handling of partial
2625          vectors to the epilogue, with the main loop continuing to operate
2626          on full vectors.
2627
2628          If we are unrolling we also do not want to use partial vectors. This
2629          is to avoid the overhead of generating multiple masks and also to
2630          avoid having to execute entire iterations of FALSE masked instructions
2631          when dealing with one or less full iterations.
2632
2633          ??? We could then end up failing to use partial vectors if we
2634          decide to peel iterations into a prologue, and if the main loop
2635          then ends up processing fewer than VF iterations.  */
2636       if ((param_vect_partial_vector_usage == 1
2637            || loop_vinfo->suggested_unroll_factor > 1)
2638           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2639           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2640         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2641       else
2642         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2643     }
2644
2645   if (dump_enabled_p ())
2646     dump_printf_loc (MSG_NOTE, vect_location,
2647                      "operating on %s vectors%s.\n",
2648                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2649                      ? "partial" : "full",
2650                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2651                      ? " for epilogue loop" : "");
2652
2653   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2654     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2655        && need_peeling_or_partial_vectors_p);
2656
2657   return opt_result::success ();
2658 }
2659
2660 /* Function vect_analyze_loop_2.
2661
2662    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2663    analyses will record information in some members of LOOP_VINFO.  FATAL
2664    indicates whether some analysis hit a fatal error.  If a non-NULL pointer
2665    SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
2666    worked out suggested unroll factor, while a NULL pointer indicates that
2667    the suggested unroll factor is being applied.  SLP_DONE_FOR_SUGGESTED_UF
2668    holds the SLP decision made when the suggested unroll factor was worked
2669    out.  */
2670 static opt_result
2671 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2672                      unsigned *suggested_unroll_factor,
2673                      bool& slp_done_for_suggested_uf)
2674 {
2675   opt_result ok = opt_result::success ();
2676   int res;
2677   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2678   poly_uint64 min_vf = 2;
2679   loop_vec_info orig_loop_vinfo = NULL;
2680
2681   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2682      loop_vec_info of the first vectorized loop.  */
2683   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2684     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2685   else
2686     orig_loop_vinfo = loop_vinfo;
2687   gcc_assert (orig_loop_vinfo);
2688
2689   /* The first group of checks is independent of the vector size.  */
2690   fatal = true;
2691
2692   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2693       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2694     return opt_result::failure_at (vect_location,
2695                                    "not vectorized: simd if(0)\n");
2696
2697   /* Find all data references in the loop (which correspond to vdefs/vuses)
2698      and analyze their evolution in the loop.  */
2699
2700   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2701
2702   /* Gather the data references and count stmts in the loop.  */
2703   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2704     {
2705       opt_result res
2706         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2707                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2708                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2709       if (!res)
2710         {
2711           if (dump_enabled_p ())
2712             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2713                              "not vectorized: loop contains function "
2714                              "calls or data references that cannot "
2715                              "be analyzed\n");
2716           return res;
2717         }
2718       loop_vinfo->shared->save_datarefs ();
2719     }
2720   else
2721     loop_vinfo->shared->check_datarefs ();
2722
2723   /* Analyze the data references and also adjust the minimal
2724      vectorization factor according to the loads and stores.  */
2725
2726   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2727   if (!ok)
2728     {
2729       if (dump_enabled_p ())
2730         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2731                          "bad data references.\n");
2732       return ok;
2733     }
2734
2735   /* Check if we are applying unroll factor now.  */
2736   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2737   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2738
2739   /* If the slp decision is false when suggested unroll factor is worked
2740      out, and we are applying suggested unroll factor, we can simply skip
2741      all slp related analyses this time.  */
2742   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2743
2744   /* Classify all cross-iteration scalar data-flow cycles.
2745      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2746   vect_analyze_scalar_cycles (loop_vinfo, slp);
2747
2748   vect_pattern_recog (loop_vinfo);
2749
2750   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2751
2752   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2753      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2754
2755   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2756   if (!ok)
2757     {
2758       if (dump_enabled_p ())
2759         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760                          "bad data access.\n");
2761       return ok;
2762     }
2763
2764   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2765
2766   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2767   if (!ok)
2768     {
2769       if (dump_enabled_p ())
2770         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2771                          "unexpected pattern.\n");
2772       return ok;
2773     }
2774
2775   /* The rest of the analysis below depends on the vector size in some way.  */
2776   fatal = false;
2777
2778   /* Analyze data dependences between the data-refs in the loop
2779      and adjust the maximum vectorization factor according to
2780      the dependences.
2781      FORNOW: fail at the first data dependence that we encounter.  */
2782
2783   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2784   if (!ok)
2785     {
2786       if (dump_enabled_p ())
2787         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2788                          "bad data dependence.\n");
2789       return ok;
2790     }
2791   if (max_vf != MAX_VECTORIZATION_FACTOR
2792       && maybe_lt (max_vf, min_vf))
2793     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2794   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2795
2796   ok = vect_determine_vectorization_factor (loop_vinfo);
2797   if (!ok)
2798     {
2799       if (dump_enabled_p ())
2800         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801                          "can't determine vectorization factor.\n");
2802       return ok;
2803     }
2804   if (max_vf != MAX_VECTORIZATION_FACTOR
2805       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2806     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2807
2808   /* Compute the scalar iteration cost.  */
2809   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2810
2811   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2812
2813   if (slp)
2814     {
2815       /* Check the SLP opportunities in the loop, analyze and build
2816          SLP trees.  */
2817       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2818       if (!ok)
2819         return ok;
2820
2821       /* If there are any SLP instances mark them as pure_slp.  */
2822       slp = vect_make_slp_decision (loop_vinfo);
2823       if (slp)
2824         {
2825           /* Find stmts that need to be both vectorized and SLPed.  */
2826           vect_detect_hybrid_slp (loop_vinfo);
2827
2828           /* Update the vectorization factor based on the SLP decision.  */
2829           vect_update_vf_for_slp (loop_vinfo);
2830
2831           /* Optimize the SLP graph with the vectorization factor fixed.  */
2832           vect_optimize_slp (loop_vinfo);
2833
2834           /* Gather the loads reachable from the SLP graph entries.  */
2835           vect_gather_slp_loads (loop_vinfo);
2836         }
2837     }
2838
2839   bool saved_can_use_partial_vectors_p
2840     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2841
2842   /* We don't expect to have to roll back to anything other than an empty
2843      set of rgroups.  */
2844   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2845
2846   /* This is the point where we can re-start analysis with SLP forced off.  */
2847 start_over:
2848
2849   /* Apply the suggested unrolling factor, this was determined by the backend
2850      during finish_cost the first time we ran the analysis for this
2851      vector mode.  */
2852   if (applying_suggested_uf)
2853     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2854
2855   /* Now the vectorization factor is final.  */
2856   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2857   gcc_assert (known_ne (vectorization_factor, 0U));
2858
2859   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2860     {
2861       dump_printf_loc (MSG_NOTE, vect_location,
2862                        "vectorization_factor = ");
2863       dump_dec (MSG_NOTE, vectorization_factor);
2864       dump_printf (MSG_NOTE, ", niters = %wd\n",
2865                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2866     }
2867
2868   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2869
2870   /* Analyze the alignment of the data-refs in the loop.
2871      Fail if a data reference is found that cannot be vectorized.  */
2872
2873   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2874   if (!ok)
2875     {
2876       if (dump_enabled_p ())
2877         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878                          "bad data alignment.\n");
2879       return ok;
2880     }
2881
2882   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2883      It is important to call pruning after vect_analyze_data_ref_accesses,
2884      since we use grouping information gathered by interleaving analysis.  */
2885   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2886   if (!ok)
2887     return ok;
2888
2889   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2890      vectorization, since we do not want to add extra peeling or
2891      add versioning for alignment.  */
2892   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2893     /* This pass will decide on using loop versioning and/or loop peeling in
2894        order to enhance the alignment of data references in the loop.  */
2895     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2896   if (!ok)
2897     return ok;
2898
2899   if (slp)
2900     {
2901       /* Analyze operations in the SLP instances.  Note this may
2902          remove unsupported SLP instances which makes the above
2903          SLP kind detection invalid.  */
2904       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2905       vect_slp_analyze_operations (loop_vinfo);
2906       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2907         {
2908           ok = opt_result::failure_at (vect_location,
2909                                        "unsupported SLP instances\n");
2910           goto again;
2911         }
2912
2913       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2914       slp_tree load_node, slp_root;
2915       unsigned i, x;
2916       slp_instance instance;
2917       bool can_use_lanes = true;
2918       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2919         {
2920           slp_root = SLP_INSTANCE_TREE (instance);
2921           int group_size = SLP_TREE_LANES (slp_root);
2922           tree vectype = SLP_TREE_VECTYPE (slp_root);
2923           bool loads_permuted = false;
2924           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2925             {
2926               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2927                 continue;
2928               unsigned j;
2929               stmt_vec_info load_info;
2930               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2931                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2932                   {
2933                     loads_permuted = true;
2934                     break;
2935                   }
2936             }
2937
2938           /* If the loads and stores can be handled with load/store-lane
2939              instructions record it and move on to the next instance.  */
2940           if (loads_permuted
2941               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2942               && vect_store_lanes_supported (vectype, group_size, false)
2943                    != IFN_LAST)
2944             {
2945               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2946                 {
2947                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2948                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2949                   /* Use SLP for strided accesses (or if we can't
2950                      load-lanes).  */
2951                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2952                       || vect_load_lanes_supported
2953                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2954                              DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2955                     break;
2956                 }
2957
2958               can_use_lanes
2959                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2960
2961               if (can_use_lanes && dump_enabled_p ())
2962                 dump_printf_loc (MSG_NOTE, vect_location,
2963                                  "SLP instance %p can use load/store-lanes\n",
2964                                  (void *) instance);
2965             }
2966           else
2967             {
2968               can_use_lanes = false;
2969               break;
2970             }
2971         }
2972
2973       /* If all SLP instances can use load/store-lanes abort SLP and try again
2974          with SLP disabled.  */
2975       if (can_use_lanes)
2976         {
2977           ok = opt_result::failure_at (vect_location,
2978                                        "Built SLP cancelled: can use "
2979                                        "load/store-lanes\n");
2980           if (dump_enabled_p ())
2981             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982                              "Built SLP cancelled: all SLP instances support "
2983                              "load/store-lanes\n");
2984           goto again;
2985         }
2986     }
2987
2988   /* Dissolve SLP-only groups.  */
2989   vect_dissolve_slp_only_groups (loop_vinfo);
2990
2991   /* Scan all the remaining operations in the loop that are not subject
2992      to SLP and make sure they are vectorizable.  */
2993   ok = vect_analyze_loop_operations (loop_vinfo);
2994   if (!ok)
2995     {
2996       if (dump_enabled_p ())
2997         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998                          "bad operation or unsupported loop bound.\n");
2999       return ok;
3000     }
3001
3002   /* For now, we don't expect to mix both masking and length approaches for one
3003      loop, disable it if both are recorded.  */
3004   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3005       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3006       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3007     {
3008       if (dump_enabled_p ())
3009         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010                          "can't vectorize a loop with partial vectors"
3011                          " because we don't expect to mix different"
3012                          " approaches with partial vectors for the"
3013                          " same loop.\n");
3014       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3015     }
3016
3017   /* If we still have the option of using partial vectors,
3018      check whether we can generate the necessary loop controls.  */
3019   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3020     {
3021       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3022         {
3023           if (!vect_verify_full_masking (loop_vinfo)
3024               && !vect_verify_full_masking_avx512 (loop_vinfo))
3025             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3026         }
3027       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3028         if (!vect_verify_loop_lens (loop_vinfo))
3029           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3030     }
3031
3032   /* If we're vectorizing a loop that uses length "controls" and
3033      can iterate more than once, we apply decrementing IV approach
3034      in loop control.  */
3035   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3036       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3037       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3038       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3039            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3040                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3041     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3042
3043   /* If a loop uses length controls and has a decrementing loop control IV,
3044      we will normally pass that IV through a MIN_EXPR to calculate the
3045      basis for the length controls.  E.g. in a loop that processes one
3046      element per scalar iteration, the number of elements would be
3047      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3048
3049      This MIN_EXPR approach allows us to use pointer IVs with an invariant
3050      step, since only the final iteration of the vector loop can have
3051      inactive lanes.
3052
3053      However, some targets have a dedicated instruction for calculating the
3054      preferred length, given the total number of elements that still need to
3055      be processed.  This is encapsulated in the SELECT_VL internal function.
3056
3057      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3058      to determine the basis for the length controls.  However, unlike the
3059      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3060      lanes inactive in any iteration of the vector loop, not just the last
3061      iteration.  This SELECT_VL approach therefore requires us to use pointer
3062      IVs with variable steps.
3063
3064      Once we've decided how many elements should be processed by one
3065      iteration of the vector loop, we need to populate the rgroup controls.
3066      If a loop has multiple rgroups, we need to make sure that those rgroups
3067      "line up" (that is, they must be consistent about which elements are
3068      active and which aren't).  This is done by vect_adjust_loop_lens_control.
3069
3070      In principle, it would be possible to use vect_adjust_loop_lens_control
3071      on either the result of a MIN_EXPR or the result of a SELECT_VL.
3072      However:
3073
3074      (1) In practice, it only makes sense to use SELECT_VL when a vector
3075          operation will be controlled directly by the result.  It is not
3076          worth using SELECT_VL if it would only be the input to other
3077          calculations.
3078
3079      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3080          pointer IV will need N updates by a variable amount (N-1 updates
3081          within the iteration and 1 update to move to the next iteration).
3082
3083      Because of this, we prefer to use the MIN_EXPR approach whenever there
3084      is more than one length control.
3085
3086      In addition, SELECT_VL always operates to a granularity of 1 unit.
3087      If we wanted to use it to control an SLP operation on N consecutive
3088      elements, we would need to make the SELECT_VL inputs measure scalar
3089      iterations (rather than elements) and then multiply the SELECT_VL
3090      result by N.  But using SELECT_VL this way is inefficient because
3091      of (1) above.
3092
3093      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3094         satisfied:
3095
3096      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3097      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3098
3099      Since SELECT_VL (variable step) will make SCEV analysis fail and then
3100      we will fail to gain the benefits of subsequent unroll optimizations, we
3101      prefer using the MIN_EXPR approach in this situation.  */
3102   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3103     {
3104       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3105       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3106                                           OPTIMIZE_FOR_SPEED)
3107           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3108           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3109           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3110               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3111         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3112     }
3113
3114   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3115      assuming that the loop will be used as a main loop.  We will redo
3116      this analysis later if we instead decide to use the loop as an
3117      epilogue loop.  */
3118   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3119   if (!ok)
3120     return ok;
3121
3122   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3123      to be able to handle fewer than VF scalars, or needs to have a lower VF
3124      than the main loop.  */
3125   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3126       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3127     {
3128       poly_uint64 unscaled_vf
3129         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3130                      orig_loop_vinfo->suggested_unroll_factor);
3131       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3132         return opt_result::failure_at (vect_location,
3133                                        "Vectorization factor too high for"
3134                                        " epilogue loop.\n");
3135     }
3136
3137   /* Check the costings of the loop make vectorizing worthwhile.  */
3138   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3139   if (res < 0)
3140     {
3141       ok = opt_result::failure_at (vect_location,
3142                                    "Loop costings may not be worthwhile.\n");
3143       goto again;
3144     }
3145   if (!res)
3146     return opt_result::failure_at (vect_location,
3147                                    "Loop costings not worthwhile.\n");
3148
3149   /* If an epilogue loop is required make sure we can create one.  */
3150   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3151       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3152     {
3153       if (dump_enabled_p ())
3154         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3155       if (!vect_can_advance_ivs_p (loop_vinfo)
3156           || !slpeel_can_duplicate_loop_p (loop,
3157                                            LOOP_VINFO_IV_EXIT (loop_vinfo),
3158                                            LOOP_VINFO_IV_EXIT (loop_vinfo)))
3159         {
3160           ok = opt_result::failure_at (vect_location,
3161                                        "not vectorized: can't create required "
3162                                        "epilog loop\n");
3163           goto again;
3164         }
3165     }
3166
3167   /* During peeling, we need to check if number of loop iterations is
3168      enough for both peeled prolog loop and vector loop.  This check
3169      can be merged along with threshold check of loop versioning, so
3170      increase threshold for this case if necessary.
3171
3172      If we are analyzing an epilogue we still want to check what its
3173      versioning threshold would be.  If we decide to vectorize the epilogues we
3174      will want to use the lowest versioning threshold of all epilogues and main
3175      loop.  This will enable us to enter a vectorized epilogue even when
3176      versioning the loop.  We can't simply check whether the epilogue requires
3177      versioning though since we may have skipped some versioning checks when
3178      analyzing the epilogue.  For instance, checks for alias versioning will be
3179      skipped when dealing with epilogues as we assume we already checked them
3180      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3181   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3182     {
3183       poly_uint64 niters_th = 0;
3184       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3185
3186       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3187         {
3188           /* Niters for peeled prolog loop.  */
3189           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3190             {
3191               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3192               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3193               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3194             }
3195           else
3196             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3197         }
3198
3199       /* Niters for at least one iteration of vectorized loop.  */
3200       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3201         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3202       /* One additional iteration because of peeling for gap.  */
3203       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3204         niters_th += 1;
3205
3206       /*  Use the same condition as vect_transform_loop to decide when to use
3207           the cost to determine a versioning threshold.  */
3208       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3209           && ordered_p (th, niters_th))
3210         niters_th = ordered_max (poly_uint64 (th), niters_th);
3211
3212       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3213     }
3214
3215   gcc_assert (known_eq (vectorization_factor,
3216                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3217
3218   slp_done_for_suggested_uf = slp;
3219
3220   /* Ok to vectorize!  */
3221   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3222   return opt_result::success ();
3223
3224 again:
3225   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3226   gcc_assert (!ok);
3227
3228   /* Try again with SLP forced off but if we didn't do any SLP there is
3229      no point in re-trying.  */
3230   if (!slp)
3231     return ok;
3232
3233   /* If the slp decision is true when suggested unroll factor is worked
3234      out, and we are applying suggested unroll factor, we don't need to
3235      re-try any more.  */
3236   if (applying_suggested_uf && slp_done_for_suggested_uf)
3237     return ok;
3238
3239   /* If there are reduction chains re-trying will fail anyway.  */
3240   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3241     return ok;
3242
3243   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3244      via interleaving or lane instructions.  */
3245   slp_instance instance;
3246   slp_tree node;
3247   unsigned i, j;
3248   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3249     {
3250       stmt_vec_info vinfo;
3251       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3252       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3253         continue;
3254       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3255       unsigned int size = DR_GROUP_SIZE (vinfo);
3256       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3257       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3258          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3259          && ! vect_grouped_store_supported (vectype, size))
3260         return opt_result::failure_at (vinfo->stmt,
3261                                        "unsupported grouped store\n");
3262       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3263         {
3264           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3265           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3266           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3267           size = DR_GROUP_SIZE (vinfo);
3268           vectype = STMT_VINFO_VECTYPE (vinfo);
3269           if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3270               && ! vect_grouped_load_supported (vectype, single_element_p,
3271                                                 size))
3272             return opt_result::failure_at (vinfo->stmt,
3273                                            "unsupported grouped load\n");
3274         }
3275     }
3276
3277   if (dump_enabled_p ())
3278     dump_printf_loc (MSG_NOTE, vect_location,
3279                      "re-trying with SLP disabled\n");
3280
3281   /* Roll back state appropriately.  No SLP this time.  */
3282   slp = false;
3283   /* Restore vectorization factor as it were without SLP.  */
3284   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3285   /* Free the SLP instances.  */
3286   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3287     vect_free_slp_instance (instance);
3288   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3289   /* Reset SLP type to loop_vect on all stmts.  */
3290   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3291     {
3292       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3293       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3294            !gsi_end_p (si); gsi_next (&si))
3295         {
3296           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3297           STMT_SLP_TYPE (stmt_info) = loop_vect;
3298           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3299               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3300             {
3301               /* vectorizable_reduction adjusts reduction stmt def-types,
3302                  restore them to that of the PHI.  */
3303               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3304                 = STMT_VINFO_DEF_TYPE (stmt_info);
3305               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3306                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3307                 = STMT_VINFO_DEF_TYPE (stmt_info);
3308             }
3309         }
3310       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3311            !gsi_end_p (si); gsi_next (&si))
3312         {
3313           if (is_gimple_debug (gsi_stmt (si)))
3314             continue;
3315           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3316           STMT_SLP_TYPE (stmt_info) = loop_vect;
3317           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3318             {
3319               stmt_vec_info pattern_stmt_info
3320                 = STMT_VINFO_RELATED_STMT (stmt_info);
3321               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3322                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3323
3324               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3325               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3326               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3327                    !gsi_end_p (pi); gsi_next (&pi))
3328                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3329                   = loop_vect;
3330             }
3331         }
3332     }
3333   /* Free optimized alias test DDRS.  */
3334   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3335   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3336   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3337   /* Reset target cost data.  */
3338   delete loop_vinfo->vector_costs;
3339   loop_vinfo->vector_costs = nullptr;
3340   /* Reset accumulated rgroup information.  */
3341   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3342   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3343   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3344   /* Reset assorted flags.  */
3345   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3346   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3347   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3348   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3349   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3350     = saved_can_use_partial_vectors_p;
3351
3352   goto start_over;
3353 }
3354
3355 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3356    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
3357    OLD_LOOP_VINFO is better unless something specifically indicates
3358    otherwise.
3359
3360    Note that this deliberately isn't a partial order.

        Both arguments must describe the same loop; this is asserted.
        The decision is made in two steps:

        - If the loop has a simdlen and exactly one of the two VFs is
          known to equal it, that candidate wins.
        - Otherwise the vector_costs of NEW_LOOP_VINFO are compared with
          those of OLD_LOOP_VINFO, using better_epilogue_loop_than_p when
          OLD_LOOP_VINFO has a LOOP_VINFO_ORIG_LOOP_INFO and
          better_main_loop_than_p when it does not.  */
3361
3362 static bool
3363 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3364                           loop_vec_info old_loop_vinfo)
3365 {
3366   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3367   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3368
3369   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3370   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3371
3372   /* Always prefer a VF of loop->simdlen over any other VF.  */
3373   if (loop->simdlen)
3374     {
3375       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3376       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
           /* Only decide here when exactly one candidate matches simdlen;
              if both or neither match, fall through to the cost checks.  */
3377       if (new_simdlen_p != old_simdlen_p)
3378         return new_simdlen_p;
3379     }
3380
       /* Let the cost model of the new candidate judge it against the old
          one.  A non-null LOOP_VINFO_ORIG_LOOP_INFO selects the epilogue
          comparison and is passed along as the main loop; presumably it is
          only set for loop_vinfos analyzed as epilogues -- confirm against
          vect_create_loop_vinfo.  */
3381   const auto *old_costs = old_loop_vinfo->vector_costs;
3382   const auto *new_costs = new_loop_vinfo->vector_costs;
3383   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3384     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3385
3386   return new_costs->better_main_loop_than_p (old_costs);
3387 }
3388
3389 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
3390    true if we should.

        This is vect_better_loop_vinfo_p plus a dump note: when the new
        candidate is preferred and dumping is enabled, the vector modes of
        both candidates are reported.  Neither loop_vinfo is modified or
        freed here; acting on the result is left to the caller.  */
3391
3392 static bool
3393 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3394                         loop_vec_info old_loop_vinfo)
3395 {
       /* Keep the old candidate unless the new one is judged better.  */
3396   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3397     return false;
3398
3399   if (dump_enabled_p ())
3400     dump_printf_loc (MSG_NOTE, vect_location,
3401                      "***** Preferring vector mode %s to vector mode %s\n",
3402                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3403                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3404   return true;
3405 }
3406
3407 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3408    not NULL.  Set AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is VOIDmode
3409    and advance MODE_I to the next mode useful to analyze.
3410    Return the loop_vinfo on success and wrapped null on failure.

        SHARED and LOOP_FORM_INFO are forwarded to vect_create_loop_vinfo.
        FATAL is passed by reference to vect_analyze_loop_2; when the
        analysis fails with FATAL set, MAIN_LOOP_VINFO is asserted to be
        NULL via gcc_checking_assert.

        When analyzing a main loop (MAIN_LOOP_VINFO is NULL) and the first
        analysis succeeds with a suggested unroll factor greater than 1, the
        analysis is repeated with that unroll factor applied.  The unrolled
        loop_vinfo replaces the first one only if the second analysis also
        succeeds.

        MODE_I is advanced whether or not the analysis succeeded.  */
3411
3412 static opt_loop_vec_info
3413 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3414                      const vect_loop_form_info *loop_form_info,
3415                      loop_vec_info main_loop_vinfo,
3416                      const vector_modes &vector_modes, unsigned &mode_i,
3417                      machine_mode &autodetected_vector_mode,
3418                      bool &fatal)
3419 {
3420   loop_vec_info loop_vinfo
3421     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3422
       /* Remember the requested mode separately: it is compared against
          VOIDmode below, while loop_vinfo->vector_mode is what gets reported
          and recorded as the autodetected mode.  */
3423   machine_mode vector_mode = vector_modes[mode_i];
3424   loop_vinfo->vector_mode = vector_mode;
3425   unsigned int suggested_unroll_factor = 1;
3426   bool slp_done_for_suggested_uf = false;
3427
3428   /* Run the main analysis.  */
3429   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3430                                         &suggested_unroll_factor,
3431                                         slp_done_for_suggested_uf);
       /* Note that the " failed" literal starts with a space, so the dump
          line contains two spaces between "Analysis" and "failed".  */
3432   if (dump_enabled_p ())
3433     dump_printf_loc (MSG_NOTE, vect_location,
3434                      "***** Analysis %s with vector mode %s\n",
3435                      res ? "succeeded" : " failed",
3436                      GET_MODE_NAME (loop_vinfo->vector_mode));
3437
       /* Only a successful main-loop analysis is re-tried with unrolling.  */
3438   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3439     {
3440       if (dump_enabled_p ())
3441         dump_printf_loc (MSG_NOTE, vect_location,
3442                          "***** Re-trying analysis for unrolling"
3443                          " with unroll factor %d and slp %s.\n",
3444                          suggested_unroll_factor,
3445                          slp_done_for_suggested_uf ? "on" : "off");
3446       loop_vec_info unroll_vinfo
3447         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3448       unroll_vinfo->vector_mode = vector_mode;
3449       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
           /* Passing NULL instead of a pointer to the suggested unroll
              factor distinguishes this run from the first one.  RES is
              deliberately left untouched, so a failure here does not turn
              the earlier success into a failure.  */
3450       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3451                                                 slp_done_for_suggested_uf);
3452       if (new_res)
3453         {
3454           delete loop_vinfo;
3455           loop_vinfo = unroll_vinfo;
3456         }
3457       else
3458         delete unroll_vinfo;
3459     }
3460
3461   /* Remember the autodetected vector mode.  */
3462   if (vector_mode == VOIDmode)
3463     autodetected_vector_mode = loop_vinfo->vector_mode;
3464
3465   /* Advance mode_i, first skipping modes that would result in the
3466      same analysis result.  */
3467   while (mode_i + 1 < vector_modes.length ()
3468          && vect_chooses_same_modes_p (loop_vinfo,
3469                                        vector_modes[mode_i + 1]))
3470     {
3471       if (dump_enabled_p ())
3472         dump_printf_loc (MSG_NOTE, vect_location,
3473                          "***** The result for vector mode %s would"
3474                          " be the same\n",
3475                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3476       mode_i += 1;
3477     }
       /* Additionally skip at most one further mode if it and the
          autodetected mode map onto each other through related_vector_mode
          in both directions.  */
3478   if (mode_i + 1 < vector_modes.length ()
3479       && VECTOR_MODE_P (autodetected_vector_mode)
3480       && (related_vector_mode (vector_modes[mode_i + 1],
3481                                GET_MODE_INNER (autodetected_vector_mode))
3482           == autodetected_vector_mode)
3483       && (related_vector_mode (autodetected_vector_mode,
3484                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3485           == vector_modes[mode_i + 1]))
3486     {
3487       if (dump_enabled_p ())
3488         dump_printf_loc (MSG_NOTE, vect_location,
3489                          "***** Skipping vector mode %s, which would"
3490                          " repeat the analysis for %s\n",
3491                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3492                          GET_MODE_NAME (autodetected_vector_mode));
3493       mode_i += 1;
3494     }
       /* Step past the last mode covered by this analysis.  */
3495   mode_i++;
3496
3497   if (!res)
3498     {
3499       delete loop_vinfo;
3500       if (fatal)
3501         gcc_checking_assert (main_loop_vinfo == NULL);
3502       return opt_loop_vec_info::propagate_failure (res);
3503     }
3504
3505   return opt_loop_vec_info::success (loop_vinfo);
3506 }
3507
3508 /* Function vect_analyze_loop.
3509
3510    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3511    for it.  The different analyses will record information in the
3512    loop_vec_info struct.

        The work is done in two phases:

        1. Pick the loop_vec_info for the main loop by calling
           vect_analyze_loop_1 for successive entries of the target's list
           of vector modes.  The first entry is VOIDmode, which requests
           autodetection.
        2. If epilogue vectorization is allowed, run the analysis again from
           the first mode with the chosen main loop_vec_info as the main
           loop, and record at most one epilogue loop_vec_info in its
           epilogue_vinfos.

        Return the main loop_vec_info on success.  Return a failure wrapper
        if an enclosing loop is already marked vectorizable, if
        find_loop_nest or the loop-form analysis fails, or if no mode yields
        a successful analysis.  SHARED supplies the loop nest filled in by
        find_loop_nest and is passed on to the per-mode analysis.  */
3513 opt_loop_vec_info
3514 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3515 {
3516   DUMP_VECT_SCOPE ("analyze_loop_nest");
3517
       /* Nothing to do for an inner loop whose immediately enclosing loop
          already has a loop_vec_info marked as vectorizable.  */
3518   if (loop_outer (loop)
3519       && loop_vec_info_for_loop (loop_outer (loop))
3520       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3521     return opt_loop_vec_info::failure_at (vect_location,
3522                                           "outer-loop already vectorized.\n");
3523
3524   if (!find_loop_nest (loop, &shared->loop_nest))
3525     return opt_loop_vec_info::failure_at
3526       (vect_location,
3527        "not vectorized: loop nest containing two or more consecutive inner"
3528        " loops cannot be vectorized\n");
3529
3530   /* Analyze the loop form.  */
3531   vect_loop_form_info loop_form_info;
3532   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3533   if (!res)
3534     {
3535       if (dump_enabled_p ())
3536         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3537                          "bad loop form.\n");
3538       return opt_loop_vec_info::propagate_failure (res);
3539     }
3540   if (!integer_onep (loop_form_info.assumptions))
3541     {
3542       /* We consider to vectorize this loop by versioning it under
3543          some assumptions.  In order to do this, we need to clear
3544          existing information computed by scev and niter analyzer.  */
3545       scev_reset_htab ();
3546       free_numbers_of_iterations_estimates (loop);
3547       /* Also set flag for this loop so that following scev and niter
3548          analysis are done under the assumptions.  */
3549       loop_constraint_set (loop, LOOP_C_FINITE);
3550     }
3551
3552   auto_vector_modes vector_modes;
3553   /* Autodetect first vector size we try.  */
3554   vector_modes.safe_push (VOIDmode);
       /* The target hook receives the vector after VOIDmode was pushed, so
          any modes it adds follow the autodetection slot at index 0.  */
3555   unsigned int autovec_flags
3556     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3557                                                     loop->simdlen != 0);
       /* Compare costs across modes only when the target asks for it and
          the loop does not use the unlimited cost model.  */
3558   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3559                              && !unlimited_cost_model (loop));
3560   machine_mode autodetected_vector_mode = VOIDmode;
3561   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3562   unsigned int mode_i = 0;
       /* Local copy of the requested simdlen; it is cleared to 0 once a
          candidate with exactly that VF has been found.  */
3563   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3564
3565   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3566      a mode has not been analyzed.  */
3567   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3568   for (unsigned i = 0; i < vector_modes.length (); ++i)
3569     cached_vf_per_mode.safe_push (0);
3570
3571   /* First determine the main loop vectorization mode, either the first
3572      one that works, starting with auto-detecting the vector mode and then
3573      following the targets order of preference, or the one with the
3574      lowest cost if pick_lowest_cost_p.  */
3575   while (1)
3576     {
3577       bool fatal;
           /* vect_analyze_loop_1 advances MODE_I, so remember which entry
              is being analyzed in order to record its VF afterwards.  */
3578       unsigned int last_mode_i = mode_i;
3579       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3580          failed.  */
3581       cached_vf_per_mode[last_mode_i] = -1;
3582       opt_loop_vec_info loop_vinfo
3583         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3584                                NULL, vector_modes, mode_i,
3585                                autodetected_vector_mode, fatal);
           /* A fatal failure ends the search over modes.  */
3586       if (fatal)
3587         break;
3588
3589       if (loop_vinfo)
3590         {
3591           /* Analysis has been successful so update the VF value.  The
3592               VF should always be a multiple of unroll_factor and we want to
3593               capture the original VF here.  */
3594           cached_vf_per_mode[last_mode_i]
3595             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3596                          loop_vinfo->suggested_unroll_factor);
3597           /* Once we hit the desired simdlen for the first time,
3598              discard any previous attempts.  */
3599           if (simdlen
3600               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3601             {
3602               delete first_loop_vinfo;
3603               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3604               simdlen = 0;
3605             }
3606           else if (pick_lowest_cost_p
3607                    && first_loop_vinfo
3608                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3609             {
3610               /* Pick loop_vinfo over first_loop_vinfo.  */
3611               delete first_loop_vinfo;
3612               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3613             }
               /* Either adopt the new candidate (no current best, or the
                  best was just discarded above) or drop it.  */
3614           if (first_loop_vinfo == NULL)
3615             first_loop_vinfo = loop_vinfo;
3616           else
3617             {
3618               delete loop_vinfo;
3619               loop_vinfo = opt_loop_vec_info::success (NULL);
3620             }
3621
3622           /* Commit to first_loop_vinfo if we have no reason to try
3623              alternatives.  */
3624           if (!simdlen && !pick_lowest_cost_p)
3625             break;
3626         }
           /* Stop when every mode has been covered or when no vector mode
              was autodetected.  */
3627       if (mode_i == vector_modes.length ()
3628           || autodetected_vector_mode == VOIDmode)
3629         break;
3630
3631       /* Try the next biggest vector size.  */
3632       if (dump_enabled_p ())
3633         dump_printf_loc (MSG_NOTE, vect_location,
3634                          "***** Re-trying analysis with vector mode %s\n",
3635                          GET_MODE_NAME (vector_modes[mode_i]));
3636     }
       /* NOTE(review): RES still holds the loop-form result, which was
          successful if we got here, so the propagated failure presumably
          carries no problem description -- confirm.  */
3637   if (!first_loop_vinfo)
3638     return opt_loop_vec_info::propagate_failure (res);
3639
3640   if (dump_enabled_p ())
3641     dump_printf_loc (MSG_NOTE, vect_location,
3642                      "***** Choosing vector mode %s\n",
3643                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3644
3645   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3646      enabled, SIMDUID is not set, it is the innermost loop, the main
3647      loop needs peeling for niters and we have either already found the
3648      loop's SIMDLEN or there was no SIMDLEN to begin with.
3649      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3650   bool vect_epilogues = (!simdlen
3651                          && loop->inner == NULL
3652                          && param_vect_epilogues_nomask
3653                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3654                          && !loop->simduid);
3655   if (!vect_epilogues)
3656     return first_loop_vinfo;
3657
3658   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3659   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3660
3661   /* For epilogues start the analysis from the first mode.  The motivation
3662      behind starting from the beginning comes from cases where the VECTOR_MODES
3663      array may contain length-agnostic and length-specific modes.  Their
3664      ordering is not guaranteed, so we could end up picking a mode for the main
3665      loop that is after the epilogue's optimal mode.  */
       /* Slot 0 held VOIDmode for autodetection; reuse it for the mode that
          was autodetected during the main-loop analysis.  */
3666   vector_modes[0] = autodetected_vector_mode;
3667   mode_i = 0;
3668
3669   bool supports_partial_vectors =
3670     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3671   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3672
3673   while (1)
3674     {
3675       /* If the target does not support partial vectors we can shorten the
3676          number of modes to analyze for the epilogue as we know we can't pick a
3677          mode that would lead to a VF at least as big as the
3678          FIRST_VINFO_VF.  */
           /* Presumably this also skips modes that failed the main-loop
              analysis, since their cached entry was set to -1 in an
              unsigned type -- confirm.  */
3679       if (!supports_partial_vectors
3680           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3681         {
3682           mode_i++;
3683           if (mode_i == vector_modes.length ())
3684             break;
3685           continue;
3686         }
3687
3688       if (dump_enabled_p ())
3689         dump_printf_loc (MSG_NOTE, vect_location,
3690                          "***** Re-trying epilogue analysis with vector "
3691                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3692
3693       bool fatal;
           /* Passing first_loop_vinfo as the main loop makes this an
              epilogue analysis.  */
3694       opt_loop_vec_info loop_vinfo
3695         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3696                                first_loop_vinfo,
3697                                vector_modes, mode_i,
3698                                autodetected_vector_mode, fatal);
3699       if (fatal)
3700         break;
3701
3702       if (loop_vinfo)
3703         {
3704           if (pick_lowest_cost_p)
3705             {
3706               /* Keep trying to roll back vectorization attempts while the
3707                  loop_vec_infos they produced were worse than this one.  */
3708               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3709               while (!vinfos.is_empty ()
3710                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3711                 {
3712                   gcc_assert (vect_epilogues);
3713                   delete vinfos.pop ();
3714                 }
3715             }
3716           /* For now only allow one epilogue loop.  */
3717           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3718             {
3719               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3720               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3721               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3722                           || maybe_ne (lowest_th, 0U));
3723               /* Keep track of the known smallest versioning
3724                  threshold.  */
3725               if (ordered_p (lowest_th, th))
3726                 lowest_th = ordered_min (lowest_th, th);
3727             }
3728           else
3729             {
3730               delete loop_vinfo;
3731               loop_vinfo = opt_loop_vec_info::success (NULL);
3732             }
3733
3734           /* For now only allow one epilogue loop, but allow
3735              pick_lowest_cost_p to replace it, so commit to the
3736              first epilogue if we have no reason to try alternatives.  */
3737           if (!pick_lowest_cost_p)
3738             break;
3739         }
3740
3741       if (mode_i == vector_modes.length ())
3742         break;
3743
3744     }
3745
       /* If an epilogue was recorded, store the smallest versioning
          threshold seen in the main loop_vinfo.  */
3746   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3747     {
3748       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3749       if (dump_enabled_p ())
3750         dump_printf_loc (MSG_NOTE, vect_location,
3751                          "***** Choosing epilogue vector mode %s\n",
3752                          GET_MODE_NAME
3753                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3754     }
3755
3756   return first_loop_vinfo;
3757 }
3758
3759 /* If CODE has an in-order (fold-left) reduction function, store that
3760    function in *REDUC_FN and return true.  Otherwise return false.  */
3761
3762 static bool
3763 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3764 {
3765   /* Only additive codes qualify.  MINUS_EXPR is supported by negating the
3766      operand, which also preserves an initial -0.0 since -0.0 - 0.0
3767      (neutral op for MINUS_EXPR) == -0.0 + (-0.0) = -0.0.  */
3768   const bool additive_p = (code == PLUS_EXPR || code == MINUS_EXPR);
3769   if (!additive_p)
3770     return false;
3771
3772   *reduc_fn = IFN_FOLD_LEFT_PLUS;
3773   return true;
3774 }
3775
3776 /* Function reduction_fn_for_scalar_code
3777
3778    Map the scalar reduction operation CODE, which is either a tree code or
3779    a combined function, to the internal function that reduces a vector of
3780    partial results into a single scalar result.
3781
3782    On success store that function in *REDUC_FN and return true.  IFN_LAST
3783    is stored when CODE is a supported reduction operation that has no such
3784    internal function.
3785
3786    Return false, leaving *REDUC_FN untouched, if CODE currently cannot be
3787    vectorized as reduction.  */
3788
3789 bool
3790 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3791 {
3792   internal_fn fn;
3793   if (!code.is_tree_code ())
3794     {
3795       switch (combined_fn (code))
3796         {
3797         CASE_CFN_FMAX:
3798           fn = IFN_REDUC_FMAX;
3799           break;
3800         CASE_CFN_FMIN:
3801           fn = IFN_REDUC_FMIN;
3802           break;
3803         default:
3804           return false;
3805         }
3806     }
3807   else
3808     {
3809       switch (tree_code (code))
3810         {
3811         case MAX_EXPR:
3812           fn = IFN_REDUC_MAX;
3813           break;
3814         case MIN_EXPR:
3815           fn = IFN_REDUC_MIN;
3816           break;
3817         case PLUS_EXPR:
3818           fn = IFN_REDUC_PLUS;
3819           break;
3820         case BIT_AND_EXPR:
3821           fn = IFN_REDUC_AND;
3822           break;
3823         case BIT_IOR_EXPR:
3824           fn = IFN_REDUC_IOR;
3825           break;
3826         case BIT_XOR_EXPR:
3827           fn = IFN_REDUC_XOR;
3828           break;
3829         case MULT_EXPR:
3830         case MINUS_EXPR:
3831           /* Supported, but without a dedicated reduction function.  */
3832           fn = IFN_LAST;
3833           break;
3834         default:
3835           return false;
3836         }
3837     }
3838
3839   *reduc_fn = fn;
3840   return true;
3841 }
3842
3843 /* Return the neutral value for a reduction with operation CODE over
3844    elements of type SCALAR_TYPE, i.e. a value X such that introducing
3845    additional X elements would not affect the reduction, or null if there
3846    is no such value.  If the reduction has just a single initial value
3847    then INITIAL_VALUE is that value, otherwise it is null.  If AS_INITIAL
3848    is true the result is going to be used as initial value, in which case
3849    no signed zero is returned.  */
3850
3851 tree
3852 neutral_op_for_reduction (tree scalar_type, code_helper code,
3853                           tree initial_value, bool as_initial)
3854 {
3855   if (!code.is_tree_code ())
3856     {
3857       switch (combined_fn (code))
3858         {
3859         CASE_CFN_FMIN:
3860         CASE_CFN_FMAX:
3861           return initial_value;
3862         default:
3863           return NULL_TREE;
3864         }
3865     }
3866
3867   switch (tree_code (code))
3868     {
3869     case MAX_EXPR:
3870     case MIN_EXPR:
3871       return initial_value;
3872
3873     case MULT_EXPR:
3874       return build_one_cst (scalar_type);
3875     case BIT_AND_EXPR:
3876       return build_all_ones_cst (scalar_type);
3877     case WIDEN_SUM_EXPR:
3878     case PLUS_EXPR:
3879       /* When signed zeros are honored the neutral element is -0.0, unless
3880          the value is going to be used as initial value.  */
3881       if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3882         return build_real (scalar_type, dconstm0);
3883       /* FALLTHRU */
3884     case DOT_PROD_EXPR:
3885     case SAD_EXPR:
3886     case MINUS_EXPR:
3887     case BIT_IOR_EXPR:
3888     case BIT_XOR_EXPR:
3889       return build_zero_cst (scalar_type);
3890
3891     default:
3892       return NULL_TREE;
3893     }
3894 }
3895
3896 /* Error reporting helper for vect_is_simple_reduction below.  Dump MSG
3897    followed by GIMPLE statement STMT at vect_location with kind MSG_TYPE.  */
3898
3899 static void
3900 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3901 {
3902   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3903 }
3904
3905 /* Return true if we need an in-order reduction for operation CODE
3906    on type TYPE.  CODE is either a tree code or a combined function.  */
3907
3908 bool
3909 needs_fold_left_reduction_p (tree type, code_helper code)
3910 {
3911   /* CHECKME: check for !flag_finite_math_only too?  */
3912   if (SCALAR_FLOAT_TYPE_P (type))
3913     {
3914       /* Minimum and maximum, whether given as tree codes or as fmin/fmax
3915          calls, never need a fixed order.  Every other floating-point
3916          operation does unless flag_associative_math is set.  */
3917       bool minmax_p = false;
3918       if (code.is_tree_code ())
3919         {
3920           enum tree_code tcode = tree_code (code);
3921           minmax_p = (tcode == MIN_EXPR || tcode == MAX_EXPR);
3922         }
3923       else
3924         switch (combined_fn (code))
3925           {
3926           CASE_CFN_FMIN:
3927           CASE_CFN_FMAX:
3928             minmax_p = true;
3929             break;
3930           default:
3931             break;
3932           }
3933       return !minmax_p && !flag_associative_math;
3934     }
3935
3936   /* In-order unless CODE is a tree code whose overflow cannot trap.  */
3937   if (INTEGRAL_TYPE_P (type))
3938     {
3939       if (!code.is_tree_code ())
3940         return true;
3941       return !operation_no_trapping_overflow (type, tree_code (code));
3942     }
3943
3944   return SAT_FIXED_POINT_TYPE_P (type);
3945 }
3946
3947 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3948    has a handled computation expression.  Store the main reduction
3949    operation in *CODE and the walked chain of uses in PATH.  */
3950
3951 static bool
3952 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3953                       tree loop_arg, code_helper *code,
3954                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3955 {
3956   auto_bitmap visited;
3957   tree lookfor = PHI_RESULT (phi);
3958   ssa_op_iter curri;
       /* Position CURR on the PHI use whose value is LOOP_ARG.  */
3959   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3960   while (USE_FROM_PTR (curr) != loop_arg)
3961     curr = op_iter_next_use (&curri);
       /* NOTE(review): setting I to NUMOPS presumably marks the PHI's operand
          iterator as exhausted so that backtracking to the PHI never tries
          its other arguments -- confirm against ssa_op_iter.  */
3962   curri.i = curri.numops;
       /* Depth-first search along use->def edges for a chain that leads from
          LOOP_ARG back to the PHI result LOOKFOR.  PATH holds the chain
          currently being explored, VISITED the SSA names already tried.  */
3963   do
3964     {
3965       path.safe_push (std::make_pair (curri, curr));
3966       tree use = USE_FROM_PTR (curr);
3967       if (use == lookfor)
3968         break;
3969       gimple *def = SSA_NAME_DEF_STMT (use);
3970       if (gimple_nop_p (def)
3971           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3972         {
               /* USE has no definition inside LOOP: backtrack to the most
                  recent path entry that still has an untried operand.  */
3973 pop:
3974           do
3975             {
3976               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3977               curri = x.first;
3978               curr = x.second;
3979               do
3980                 curr = op_iter_next_use (&curri);
3981               /* Skip already visited or non-SSA operands (from iterating
3982                  over PHI args).  */
3983               while (curr != NULL_USE_OPERAND_P
3984                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3985                          || ! bitmap_set_bit (visited,
3986                                               SSA_NAME_VERSION
3987                                                 (USE_FROM_PTR (curr)))));
3988             }
3989           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
               /* PATH is empty and nothing is left to try: the search
                  failed.  */
3990           if (curr == NULL_USE_OPERAND_P)
3991             break;
3992         }
3993       else
3994         {
               /* Descend into the definition of USE, starting with its first
                  SSA operand that has not been visited yet.  */
3995           if (gimple_code (def) == GIMPLE_PHI)
3996             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3997           else
3998             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3999           while (curr != NULL_USE_OPERAND_P
4000                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4001                      || ! bitmap_set_bit (visited,
4002                                           SSA_NAME_VERSION
4003                                             (USE_FROM_PTR (curr)))))
4004             curr = op_iter_next_use (&curri);
4005           if (curr == NULL_USE_OPERAND_P)
4006             goto pop;
4007         }
4008     }
4009   while (1);
4010   if (dump_file && (dump_flags & TDF_DETAILS))
4011     {
4012       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4013       unsigned i;
4014       std::pair<ssa_op_iter, use_operand_p> *x;
4015       FOR_EACH_VEC_ELT (path, i, x)
4016         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4017       dump_printf (MSG_NOTE, "\n");
4018     }
4019
4020   /* Check whether the reduction path detected is valid.  PATH[0] is the
          latch argument of PHI itself, so the statements to validate are
          those using the operands recorded from index 1 on.  */
4021   bool fail = path.length () == 0;
4022   bool neg = false;
4023   int sign = -1;
4024   *code = ERROR_MARK;
4025   for (unsigned i = 1; i < path.length (); ++i)
4026     {
4027       gimple *use_stmt = USE_STMT (path[i].second);
4028       gimple_match_op op;
4029       if (!gimple_extract_op (use_stmt, &op))
4030         {
4031           fail = true;
4032           break;
4033         }
4034       unsigned int opi = op.num_ops;
4035       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4036         {
4037           /* The following makes sure we can compute the operand index
4038              easily plus it mostly disallows chaining via COND_EXPR condition
4039              operands.  */
4040           for (opi = 0; opi < op.num_ops; ++opi)
4041             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4042               break;
4043         }
4044       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4045         {
4046           for (opi = 0; opi < op.num_ops; ++opi)
4047             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4048               break;
4049         }
           /* OPI still equal to NUM_OPS means the path operand was not found
              among the operands of USE_STMT.  */
4050       if (opi == op.num_ops)
4051         {
4052           fail = true;
4053           break;
4054         }
4055       op.code = canonicalize_code (op.code, op.type);
4056       if (op.code == MINUS_EXPR)
4057         {
4058           op.code = PLUS_EXPR;
4059           /* Track whether we negate the reduction value each iteration.  */
4060           if (op.ops[1] == op.ops[opi])
4061             neg = ! neg;
4062         }
           /* No-op conversions are ignored.  The first other operation seen
              defines *CODE; all later ones have to match it, and for
              MIN_EXPR/MAX_EXPR also its signedness.  */
4063       if (CONVERT_EXPR_CODE_P (op.code)
4064           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4065         ;
4066       else if (*code == ERROR_MARK)
4067         {
4068           *code = op.code;
4069           sign = TYPE_SIGN (op.type);
4070         }
4071       else if (op.code != *code)
4072         {
4073           fail = true;
4074           break;
4075         }
4076       else if ((op.code == MIN_EXPR
4077                 || op.code == MAX_EXPR)
4078                && sign != TYPE_SIGN (op.type))
4079         {
4080           fail = true;
4081           break;
4082         }
4083       /* Check there's only a single stmt the op is used on.  For the
4084          not value-changing tail and the last stmt allow out-of-loop uses.
4085          ???  We could relax this and handle arbitrary live stmts by
4086          forcing a scalar epilogue for example.  */
4087       imm_use_iterator imm_iter;
4088       use_operand_p use_p;
4089       gimple *op_use_stmt;
4090       unsigned cnt = 0;
4091       bool cond_fn_p = op.code.is_internal_fn ()
4092         && (conditional_internal_fn_code (internal_fn (op.code))
4093             != ERROR_MARK);
4094
4095       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4096         {
4097         /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4098            op1 twice (once as definition, once as else) in the same operation.
4099            Allow this.  */
             /* NOTE(review): this branch is taken for every OP_USE_STMT,
                debug statements included, and always counts the arguments
                of USE_STMT rather than of OP_USE_STMT -- confirm that this
                is intended.  */
4100           if (cond_fn_p)
4101             {
4102               gcall *call = dyn_cast<gcall *> (use_stmt);
4103               unsigned else_pos
4104                 = internal_fn_else_index (internal_fn (op.code));
4105
4106               for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4107                 {
4108                   if (j == else_pos)
4109                     continue;
4110                   if (gimple_call_arg (call, j) == op.ops[opi])
4111                     cnt++;
4112                 }
4113             }
4114           else if (!is_gimple_debug (op_use_stmt)
4115                    && (*code != ERROR_MARK
4116                        || flow_bb_inside_loop_p (loop,
4117                                                  gimple_bb (op_use_stmt))))
4118             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4119               cnt++;
4120         }
4121
4122       if (cnt != 1)
4123         {
4124           fail = true;
4125           break;
4126         }
4127     }
       /* Besides outright failures, reject paths that leave the reduction
          value negated and paths on which no operation was recorded in
          *CODE.  */
4128   return ! fail && ! neg && *code != ERROR_MARK;
4129 }
4130
4131 /* Likewise, but return true only if the main reduction operation is CODE.  */
4132
4133 bool
4134 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4135                       tree loop_arg, enum tree_code code)
4136 {
4137   auto_vec<std::pair<ssa_op_iter, use_operand_p> > chain;
4138   code_helper op;
4139   bool ok = check_reduction_path (loc, loop, phi, loop_arg, &op, chain);
4140   return ok && op == code;
4141 }
4142
4143 /* Function vect_is_simple_reduction
4144
4145    (1) Detect a cross-iteration def-use cycle that represents a simple
4146    reduction computation.  We look for the following pattern:
4147
4148    loop_header:
4149      a1 = phi < a0, a2 >
4150      a3 = ...
4151      a2 = operation (a3, a1)
4152
4153    or
4154
4155    a3 = ...
4156    loop_header:
4157      a1 = phi < a0, a2 >
4158      a2 = operation (a3, a1)
4159
4160    such that:
4161    1. operation is commutative and associative and it is safe to
4162       change the order of the computation
4163    2. no uses for a2 in the loop (a2 is used out of the loop)
4164    3. no uses of a1 in the loop besides the reduction operation
4165    4. no uses of a1 outside the loop.
4166
4167    Conditions 1,4 are tested here.
4168    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4169
4170    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4171    nested cycles.
4172
4173    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4174    reductions:
4175
4176      a1 = phi < a0, a2 >
4177      inner loop (def of a3)
4178      a2 = phi < a3 >
4179
4180    (4) Detect condition expressions, ie:
4181      for (int i = 0; i < N; i++)
4182        if (a[i] < val)
4183         ret_val = a[i];
4184
        LOOP_INFO is the loop_vec_info being analyzed and PHI_INFO the
        stmt_vec_info of the candidate PHI.  On success return the
        stmt_vec_info of the definition of the PHI's latch argument, set
        *DOUBLE_REDUC if a double reduction was detected and set
        *REDUC_CHAIN_P if a reduction chain was recorded, which is only
        done when SLP is true.  Return NULL if no supported cycle is found.

4185 */
4186
4187 static stmt_vec_info
4188 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4189                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4190 {
4191   gphi *phi = as_a <gphi *> (phi_info->stmt);
4192   gimple *phi_use_stmt = NULL;
4193   imm_use_iterator imm_iter;
4194   use_operand_p use_p;
4195
4196   *double_reduc = false;
4197   *reduc_chain_p = false;
4198   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4199
4200   tree phi_name = PHI_RESULT (phi);
4201   /* ???  If there are no uses of the PHI result the inner loop reduction
4202      won't be detected as possibly double-reduction by vectorizable_reduction
4203      because that tries to walk the PHI arg from the preheader edge which
4204      can be constant.  See PR60382.  */
4205   if (has_zero_uses (phi_name))
4206     return NULL;
4207   class loop *loop = (gimple_bb (phi))->loop_father;
       /* Count the statements inside LOOP that use the PHI result.  Any
          non-debug use outside of LOOP disqualifies the PHI.  */
4208   unsigned nphi_def_loop_uses = 0;
4209   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4210     {
4211       gimple *use_stmt = USE_STMT (use_p);
4212       if (is_gimple_debug (use_stmt))
4213         continue;
4214
4215       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4216         {
4217           if (dump_enabled_p ())
4218             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4219                              "intermediate value used outside loop.\n");
4220
4221           return NULL;
4222         }
4223
4224       /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4225          op1 twice (once as definition, once as else) in the same operation.
4226          Only count it as one.  PHI_USE_STMT remembers the statement of
              the previously seen use, so only consecutive uses by the same
              statement are merged.  */
4227       if (use_stmt != phi_use_stmt)
4228         {
4229           nphi_def_loop_uses++;
4230           phi_use_stmt = use_stmt;
4231         }
4232     }
4233
4234   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4235   if (TREE_CODE (latch_def) != SSA_NAME)
4236     {
4237       if (dump_enabled_p ())
4238         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4239                          "reduction: not ssa_name: %T\n", latch_def);
4240       return NULL;
4241     }
4242
       /* The latch value has to be defined by a statement inside LOOP that
          LOOP_INFO knows about.  */
4243   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4244   if (!def_stmt_info
4245       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4246     return NULL;
4247
4248   bool nested_in_vect_loop
4249     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
       /* Count the in-loop uses of the latch value and collect its users
          outside of LOOP in LCPHIS.  */
4250   unsigned nlatch_def_loop_uses = 0;
4251   auto_vec<gphi *, 3> lcphis;
4252   bool inner_loop_of_double_reduc = false;
4253   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4254     {
4255       gimple *use_stmt = USE_STMT (use_p);
4256       if (is_gimple_debug (use_stmt))
4257         continue;
4258       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4259         nlatch_def_loop_uses++;
4260       else
4261         {
4262           /* We can have more than one loop-closed PHI.  */
4263           lcphis.safe_push (as_a <gphi *> (use_stmt));
4264           if (nested_in_vect_loop
4265               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4266                   == vect_double_reduction_def))
4267             inner_loop_of_double_reduc = true;
4268         }
4269     }
4270
4271   /* If we are vectorizing an inner reduction we are executing that
4272      in the original order only in case we are not dealing with a
4273      double reduction.  */
4274   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4275     {
4276       if (dump_enabled_p ())
4277         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4278                         "detected nested cycle: ");
4279       return def_stmt_info;
4280     }
4281
4282   /* When the inner loop of a double reduction ends up with more than
4283      one loop-closed PHI we have failed to classify alternate such
4284      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4285   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4286     {
4287       if (dump_enabled_p ())
4288         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4289                          "unhandle double reduction\n");
4290       return NULL;
4291     }
4292
4293   /* If this isn't a nested cycle or if the nested cycle reduction value
4294      is used outside of the inner loop we cannot handle uses of the reduction
4295      value.  */
4296   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4297     {
4298       if (dump_enabled_p ())
4299         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4300                          "reduction used in loop.\n");
4301       return NULL;
4302     }
4303
4304   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4305      defined in the inner loop.  */
4306   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4307     {
4308       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4309       if (gimple_phi_num_args (def_stmt) != 1
4310           || TREE_CODE (op1) != SSA_NAME)
4311         {
4312           if (dump_enabled_p ())
4313             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4314                              "unsupported phi node definition.\n");
4315
4316           return NULL;
4317         }
4318
4319       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4320          and the latch definition op1.  */
4321       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4322       if (gimple_bb (def1)
4323           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4324           && loop->inner
4325           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4326           && (is_gimple_assign (def1) || is_gimple_call (def1))
4327           && is_a <gphi *> (phi_use_stmt)
4328           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4329           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4330                                             loop_latch_edge (loop->inner))))
4331         {
4332           if (dump_enabled_p ())
4333             report_vect_op (MSG_NOTE, def_stmt,
4334                             "detected double reduction: ");
4335
4336           *double_reduc = true;
4337           return def_stmt_info;
4338         }
4339
4340       return NULL;
4341     }
4342
4343   /* Look for the expression computing latch_def from the loop PHI result.  */
4344   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4345   code_helper code;
4346   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4347                             path))
4348     {
4349       STMT_VINFO_REDUC_CODE (phi_info) = code;
4350       if (code == COND_EXPR && !nested_in_vect_loop)
4351         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4352
4353       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4354          reduction chain for which the additional restriction is that
4355          all operations in the chain are the same.  */
4356       auto_vec<stmt_vec_info, 8> reduc_chain;
4357       unsigned i;
4358       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
           /* Walk PATH from its last entry down to index 1.  */
4359       for (i = path.length () - 1; i >= 1; --i)
4360         {
4361           gimple *stmt = USE_STMT (path[i].second);
4362           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4363           gimple_match_op op;
4364           if (!gimple_extract_op (stmt, &op))
4365             gcc_unreachable ();
               /* The reduction index is the position of the path operand
                  among the rhs operands of an assignment or the arguments
                  of a call.  */
4366           if (gassign *assign = dyn_cast<gassign *> (stmt))
4367             STMT_VINFO_REDUC_IDX (stmt_info)
4368               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4369           else
4370             {
4371               gcall *call = as_a<gcall *> (stmt);
4372               STMT_VINFO_REDUC_IDX (stmt_info)
4373                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4374             }
4375           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4376                                      && (i == 1 || i == path.length () - 1));
4377           if ((op.code != code && !leading_conversion)
4378               /* We can only handle the final value in epilogue
4379                  generation for reduction chains.  */
4380               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4381             is_slp_reduc = false;
4382           /* For reduction chains we support a trailing/leading
4383              conversions.  We do not store those in the actual chain.  */
4384           if (leading_conversion)
4385             continue;
4386           reduc_chain.safe_push (stmt_info);
4387         }
4388       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4389         {
               /* Link the chain elements: each one points to the first
                  element and to its successor, the last one has no
                  successor.  */
4390           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4391             {
4392               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4393               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4394             }
4395           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4396           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4397
4398           /* Save the chain for further analysis in SLP detection.  */
4399           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4400           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4401
4402           *reduc_chain_p = true;
4403           if (dump_enabled_p ())
4404             dump_printf_loc (MSG_NOTE, vect_location,
4405                             "reduction: detected reduction chain\n");
4406         }
4407       else if (dump_enabled_p ())
4408         dump_printf_loc (MSG_NOTE, vect_location,
4409                          "reduction: detected reduction\n");
4410
4411       return def_stmt_info;
4412     }
4413
4414   if (dump_enabled_p ())
4415     dump_printf_loc (MSG_NOTE, vect_location,
4416                      "reduction: unknown pattern\n");
4417
4418   return NULL;
4419 }
4420
4421 /* Return the estimated number of peeled epilogue iterations for
4422    LOOP_VINFO, given PEEL_ITERS_PROLOGUE peeled prologue iterations
4423    (-1 if that number is not known).  */
4424
4425 static int
4426 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4427 {
4428   const int vf = vect_vf_for_cost (loop_vinfo);
4429   const bool counts_known_p = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4430                                && peel_iters_prologue != -1);
4431   if (!counts_known_p)
4432     {
4433       if (dump_enabled_p ())
4434         dump_printf_loc (MSG_NOTE, vect_location,
4435                          "cost model: epilogue peel iters set to vf/2 "
4436                          "because loop iterations are unknown .\n");
4437       return vf / 2;
4438     }
4439
4440   /* The prologue cannot peel more iterations than the loop executes.  */
4441   const int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4442   const int prologue_iters = MIN (niters, peel_iters_prologue);
4443   int epilogue_iters = (niters - prologue_iters) % vf;
4444   /* Peeling for gaps with nothing left over means peeling VF iterations.  */
4445   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && epilogue_iters == 0)
4446     epilogue_iters = vf;
4447   return epilogue_iters;
4448 }
4449
4450 /* Return the cost of peeling the loop PEEL_ITERS_PROLOGUE times, storing
4451    the estimated epilogue iteration count in *PEEL_ITERS_EPILOGUE.  */
4452 int
4453 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4454                              int *peel_iters_epilogue,
4455                              stmt_vector_for_cost *scalar_cost_vec,
4456                              stmt_vector_for_cost *prologue_cost_vec,
4457                              stmt_vector_for_cost *epilogue_cost_vec)
4458 {
4459   int cost = 0;
4460
4461   *peel_iters_epilogue
4462     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4463
4464   /* If peeled iterations are known but number of scalar loop
4465      iterations are unknown, count a taken branch per peeled loop.  */
4466   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4467     {
4468       if (peel_iters_prologue > 0)
4469         cost += record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4470                                   vect_prologue);
4471       if (*peel_iters_epilogue > 0)
4472         cost += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4473                                   vect_epilogue);
4474     }
4475
4476   int idx;
4477   stmt_info_for_cost *entry;
4478   if (peel_iters_prologue != 0)
4479     FOR_EACH_VEC_ELT (*scalar_cost_vec, idx, entry)
4480       cost += record_stmt_cost (prologue_cost_vec,
4481                                 entry->count * peel_iters_prologue,
4482                                 entry->kind, entry->stmt_info,
4483                                 entry->misalign, vect_prologue);
4484   if (*peel_iters_epilogue != 0)
4485     FOR_EACH_VEC_ELT (*scalar_cost_vec, idx, entry)
4486       cost += record_stmt_cost (epilogue_cost_vec,
4487                                 entry->count * *peel_iters_epilogue,
4488                                 entry->kind, entry->stmt_info,
4489                                 entry->misalign, vect_epilogue);
4490   return cost;
4491 }
4492
4493 /* Function vect_estimate_min_profitable_iters
4494
4495    Return the number of iterations required for the vector version of the
4496    loop to be profitable relative to the cost of the scalar version of the
4497    loop.
4498
4499    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4500    of iterations for vectorization.  -1 value means loop vectorization
4501    is not profitable.  This returned value may be used for dynamic
4502    profitability check.
4503
4504    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4505    for static check against estimated number of iterations.  */
4506
static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                                    int *ret_min_profitable_niters,
                                    int *ret_min_profitable_estimate,
                                    unsigned *suggested_unroll_factor)
{
  /* Overview: accumulate prologue, body and epilogue costs into the target
     cost data, let finish_cost produce the totals, then solve for the
     smallest iteration counts at which the vector version beats the scalar
     one.  Both outputs are set to 0 when the cost model is disabled and to
     -1 when a vector iteration saves nothing over VF scalar iterations.

     VEC_PROLOGUE_COST, VEC_INSIDE_COST and VEC_EPILOGUE_COST are filled in
     by finish_cost; VEC_OUTSIDE_COST is the sum of the prologue and
     epilogue totals.  A negative NPEEL is handled below as "number of
     iterations peeled for alignment not known at compile time".  */
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  vector_costs *target_cost_data = loop_vinfo->vector_costs;

  /* Cost model disabled.  */
  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      return;
    }

  /* Requires loop versioning tests to handle misalignment.  One scalar
     statement is costed in the prologue per possibly misaligned
     statement.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
                     "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
      if (len)
        /* Count LEN - 1 ANDs and LEN comparisons.  */
        (void) add_stmt_cost (target_cost_data, len * 2 - 1,
                              scalar_stmt, vect_prologue);
      len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
      if (len)
        {
          /* Count LEN - 1 ANDs and LEN comparisons.  */
          unsigned int nstmts = len * 2 - 1;
          /* +1 for each bias that needs adding; the code counts one for
             every lower bound whose unsigned_p flag is clear.  */
          for (unsigned int i = 0; i < len; ++i)
            if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
              nstmts += 1;
          (void) add_stmt_cost (target_cost_data, nstmts,
                                scalar_stmt, vect_prologue);
        }
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
                     "versioning aliasing.\n");
    }

  /* Requires loop versioning with niter checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      /* NOTE(review): unlike the alignment and alias checks above, this
         check is costed as a vector_stmt rather than a scalar_stmt;
         confirm that this is intended.  */
      (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
                            NULL, NULL, NULL_TREE, 0, vect_prologue);
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
                     "versioning niters.\n");
    }

  /* Any kind of versioning additionally costs one taken branch in the
     prologue.  */
  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
                          vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.  (For fully-masked loops there will be no peeling.)

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  bool prologue_need_br_taken_cost = false;
  bool prologue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_prologue.  */
  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    peel_iters_prologue = 0;
  else if (npeel < 0)
    {
      peel_iters_prologue = assumed_vf / 2;
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "cost model: "
                     "prologue peel iters set to vf/2.\n");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      prologue_need_br_taken_cost = true;
      prologue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_prologue = npeel;
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
        /* If peeled iterations are known but number of scalar loop
           iterations are unknown, count a taken branch per peeled loop.  */
        prologue_need_br_taken_cost = true;
    }

  bool epilogue_need_br_taken_cost = false;
  bool epilogue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_epilogue.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    /* We need to peel exactly one iteration for gaps.  */
    peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  else if (npeel < 0)
    {
      /* If peeling for alignment is unknown, loop bound of main loop
         becomes unknown.  */
      peel_iters_epilogue = assumed_vf / 2;
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "peeling for alignment is unknown.\n");

      /* See the same reason above in peel_iters_prologue calculation.  */
      epilogue_need_br_taken_cost = true;
      epilogue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
        /* If peeled iterations are known but number of scalar loop
           iterations are unknown, count a taken branch per peeled loop.  */
        epilogue_need_br_taken_cost = true;
    }

  stmt_info_for_cost *si;
  int j;
  /* Add costs associated with peel_iters_prologue: every recorded scalar
     iteration cost entry is re-added with its count multiplied by the
     number of peeled iterations.  */
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
        (void) add_stmt_cost (target_cost_data,
                              si->count * peel_iters_prologue, si->kind,
                              si->stmt_info, si->node, si->vectype,
                              si->misalign, vect_prologue);
      }

  /* Add costs associated with peel_iters_epilogue, in the same way.  */
  if (peel_iters_epilogue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
        (void) add_stmt_cost (target_cost_data,
                              si->count * peel_iters_epilogue, si->kind,
                              si->stmt_info, si->node, si->vectype,
                              si->misalign, vect_epilogue);
      }

  /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */

  if (prologue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
                          vect_prologue);

  if (prologue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
                          cond_branch_not_taken, vect_prologue);

  if (epilogue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
                          vect_epilogue);

  if (epilogue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
                          cond_branch_not_taken, vect_epilogue);

  /* Take care of special costs for rgroup controls of partial vectors.  */
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
          == vect_partial_vectors_avx512))
    {
      /* Calculate how many masks we need to generate.  Each rgroup in use
         contributes its FACTOR field to the count.  */
      unsigned int num_masks = 0;
      bool need_saturation = false;
      for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
        if (rgm.type)
          {
            unsigned nvectors = rgm.factor;
            num_masks += nvectors;
            /* A compare type narrower than the rgroup IV type is what
               triggers the saturation cost below.  */
            if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
                < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
              need_saturation = true;
          }

      /* ???  The target isn't able to identify the costs below as
         producing masks so it cannot penalize cases where we'd run
         out of mask registers for example.  */

      /* ???  We are also failing to account for smaller vector masks
         we generate by splitting larger masks in vect_get_loop_mask.  */

      /* In the worst case, we need to generate each mask in the prologue
         and in the loop body.  We need one splat per group and one
         compare per mask.

         Sometimes the prologue mask will fold to a constant,
         so the actual prologue cost might be smaller.  However, it's
         simpler and safer to use the worst-case cost; if this ends up
         being the tie-breaker between vectorizing or not, then it's
         probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data,
                            num_masks
                            + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
                            vector_stmt, NULL, NULL, NULL_TREE, 0,
                            vect_prologue);
      (void) add_stmt_cost (target_cost_data,
                            num_masks
                            + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
                            vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);

      /* When we need saturation we need it both in the prologue and
         the loop body; one scalar statement is costed for each.  */
      if (need_saturation)
        {
          (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
                                NULL, NULL, NULL_TREE, 0, vect_prologue);
          (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
                                NULL, NULL, NULL_TREE, 0, vect_body);
        }
    }
  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
           && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
               == vect_partial_vectors_while_ult))
    {
      /* Calculate how many masks we need to generate.  The index of an
         rgroup in rgc_vec is used as its number of vectors minus one.  */
      unsigned int num_masks = 0;
      rgroup_controls *rgm;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
                        num_vectors_m1, rgm)
        if (rgm->type)
          num_masks += num_vectors_m1 + 1;
      gcc_assert (num_masks > 0);

      /* In the worst case, we need to generate each mask in the prologue
         and in the loop body.  One of the loop body mask instructions
         replaces the comparison in the scalar loop, and since we don't
         count the scalar comparison against the scalar body, we shouldn't
         count that vector instruction against the vector body either.

         Sometimes we can use unpacks instead of generating prologue
         masks and sometimes the prologue mask will fold to a constant,
         so the actual prologue cost might be smaller.  However, it's
         simpler and safer to use the worst-case cost; if this ends up
         being the tie-breaker between vectorizing or not, then it's
         probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data, num_masks,
                            vector_stmt, NULL, NULL, NULL_TREE, 0,
                            vect_prologue);
      (void) add_stmt_cost (target_cost_data, num_masks - 1,
                            vector_stmt, NULL, NULL, NULL_TREE, 0,
                            vect_body);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Referring to the functions vect_set_loop_condition_partial_vectors
         and vect_set_loop_controls_directly, we need to generate each
         length in the prologue and in the loop body if required.  Although
         there are some possible optimizations, we consider the worst case
         here.  */

      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
      signed char partial_load_store_bias
        = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
      /* The per-iteration length updates are only costed for a loop that
         is not an epilogue loop and whose iteration count is not known to
         be smaller than VF.  */
      bool need_iterate_p
        = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
           && !vect_known_niters_smaller_than_vf (loop_vinfo));

      /* Calculate how many statements to be added.  */
      unsigned int prologue_stmts = 0;
      unsigned int body_stmts = 0;

      rgroup_controls *rgc;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
        if (rgc->type)
          {
            /* May need one SHIFT for nitems_total computation.  */
            unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
            if (nitems != 1 && !niters_known_p)
              prologue_stmts += 1;

            /* May need one MAX and one MINUS for wrap around.  */
            if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
              prologue_stmts += 2;

            /* Need one MAX and one MINUS for each batch limit excepting for
               the 1st one.  */
            prologue_stmts += num_vectors_m1 * 2;

            unsigned int num_vectors = num_vectors_m1 + 1;

            /* Need to set up lengths in prologue, only one MIN required
               for each since start index is zero.  */
            prologue_stmts += num_vectors;

            /* If we have a non-zero partial load bias, we need one PLUS
               to adjust the load length.  */
            if (partial_load_store_bias != 0)
              body_stmts += 1;

            /* Each may need two MINs and one MINUS to update lengths in body
               for next iteration.  */
            if (need_iterate_p)
              body_stmts += 3 * num_vectors;
          }

      (void) add_stmt_cost (target_cost_data, prologue_stmts,
                            scalar_stmt, vect_prologue);
      (void) add_stmt_cost (target_cost_data, body_stmts,
                            scalar_stmt, vect_body);
    }

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     does not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
        scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
              + vect_get_stmt_cost (cond_branch_not_taken);
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
        }
    }

  /* Complete the target-specific cost calculations.  */
  finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
               &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
               suggested_unroll_factor);

  /* Reset the suggested unroll factor to 1 when a maximum vectorization
     factor is in effect and VF times the suggestion is not known to stay
     within it.  */
  if (suggested_unroll_factor && *suggested_unroll_factor > 1
      && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
      && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
                    *suggested_unroll_factor,
                    LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't unroll as unrolled vectorization factor larger"
                         " than maximum vectorization factor: "
                         HOST_WIDE_INT_PRINT_UNSIGNED "\n",
                         LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
      *suggested_unroll_factor = 1;
    }

  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
                   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
                   vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
                   vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
                   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
                   scalar_outside_cost);
      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
                   vec_outside_cost);
      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
                   peel_iters_prologue);
      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
                   peel_iters_epilogue);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     NPEEL = prologue iterations + epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */

  /* Saving of one vector iteration over VF scalar iterations.  Note that
     vec_inside_cost is unsigned, so the subtraction is carried out in
     unsigned arithmetic before being converted to int.  */
  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
                          - vec_inside_cost);
  if (saving_per_viter <= 0)
    {
      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
        warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
                    "vectorization did not happen for a simd loop");

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cost model: the vector iteration cost = %d "
                         "divided by the scalar iteration cost = %d "
                         "is greater or equal to the vectorization factor = %d"
                         ".\n",
                         vec_inside_cost, scalar_single_iter_cost, assumed_vf);
      /* -1 in both outputs reports "never profitable" to the caller.  */
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      return;
    }

  /* ??? The "if" arm is written to handle all cases; see below for what
     we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Rewriting the condition above in terms of the number of
         vector iterations (vniters) rather than the number of
         scalar iterations (niters) gives:

         SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC

         <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC

         For integer N, X and Y when X > 0:

         N * X > Y <==> N >= (Y /[floor] X) + 1.  */
      int outside_overhead = (vec_outside_cost
                              - scalar_single_iter_cost * peel_iters_prologue
                              - scalar_single_iter_cost * peel_iters_epilogue
                              - scalar_outside_cost);
      /* We're only interested in cases that require at least one
         vector iteration.  */
      int min_vec_niters = 1;
      if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
                     min_vec_niters);

      /* NOTE: this repeats the condition of the enclosing "if", so the
         "else" arm further down cannot be reached as the code stands; it
         shows what !LOOP_VINFO_USING_PARTIAL_VECTORS_P would use (see the
         ??? comment above).  */
      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          /* Now that we know the minimum number of vector iterations,
             find the minimum niters for which the scalar cost is larger:

             SIC * niters > VIC * vniters + VOC - SOC

             We know that the minimum niters is no more than
             vniters * VF + NPEEL, but it might be (and often is) less
             than that if a partial vector iteration is cheaper than the
             equivalent scalar code.  */
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
                           - scalar_outside_cost);
          if (threshold <= 0)
            min_profitable_iters = 1;
          else
            min_profitable_iters = threshold / scalar_single_iter_cost + 1;
        }
      else
        /* Convert the number of vector iterations into a number of
           scalar iterations.  */
        min_profitable_iters = (min_vec_niters * assumed_vf
                                + peel_iters_prologue
                                + peel_iters_epilogue);
    }
  else
    {
      /* Solve the inequality directly for niters: the numerator below is
         divided by saving_per_viter, and the result is bumped by one when
         the scalar cost does not yet exceed the vector cost in the check
         that follows the division.  */
      min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
                              * assumed_vf
                              - vec_inside_cost * peel_iters_prologue
                              - vec_inside_cost * peel_iters_epilogue);
      if (min_profitable_iters <= 0)
        min_profitable_iters = 0;
      else
        {
          min_profitable_iters /= saving_per_viter;

          if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
              <= (((int) vec_inside_cost * min_profitable_iters)
                  + (((int) vec_outside_cost - scalar_outside_cost)
                     * assumed_vf)))
            min_profitable_iters++;
        }
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
                 "  Calculated minimum iters for profitability: %d\n",
                 min_profitable_iters);

  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
    /* We want the vectorized loop to execute at least once.  */
    min_profitable_iters = assumed_vf + peel_iters_prologue;
  else if (min_profitable_iters < peel_iters_prologue)
    /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
       vectorized loop executes at least once.  */
    min_profitable_iters = peel_iters_prologue;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  Runtime profitability threshold = %d\n",
                     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition
     must hold true:
     SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 0;
  /* ??? This "else if" arm is written to handle all cases; see below for
     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* This is a repeat of the code above, but with + SOC rather
         than - SOC.  */
      int outside_overhead = (vec_outside_cost
                              - scalar_single_iter_cost * peel_iters_prologue
                              - scalar_single_iter_cost * peel_iters_epilogue
                              + scalar_outside_cost);
      int min_vec_niters = 1;
      if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;

      /* NOTE: same condition as the enclosing "else if", so the "else"
         arm below cannot be reached as the code stands.  */
      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
                           + scalar_outside_cost);
          min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
        }
      else
        min_profitable_estimate = (min_vec_niters * assumed_vf
                                   + peel_iters_prologue
                                   + peel_iters_epilogue);
    }
  else
    {
      /* The divisor is the same quantity as saving_per_viter, which is
         known to be positive at this point.  */
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
                                 * assumed_vf
                                 - vec_inside_cost * peel_iters_prologue
                                 - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * assumed_vf)
                                   - vec_inside_cost);
    }
  /* The static estimate is never allowed to be below the runtime
     threshold computed above.  */
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  Static estimate profitability threshold = %d\n",
                     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}
5151
5152 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5153    vector elements (not bits) for a vector with NELT elements.  */
5154 static void
5155 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5156                               vec_perm_builder *sel)
5157 {
5158   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5159      by vec_perm_indices.  */
5160   sel->new_vector (nelt, 1, 3);
5161   for (unsigned int i = 0; i < 3; i++)
5162     sel->quick_push (i + offset);
5163 }
5164
5165 /* Checks whether the target supports whole-vector shifts for vectors of mode
5166    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5167    it supports vec_perm_const with masks for all necessary shift amounts.  */
5168 static bool
5169 have_whole_vector_shift (machine_mode mode)
5170 {
5171   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5172     return true;
5173
5174   /* Variable-length vectors should be handled via the optab.  */
5175   unsigned int nelt;
5176   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5177     return false;
5178
5179   vec_perm_builder sel;
5180   vec_perm_indices indices;
5181   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5182     {
5183       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5184       indices.new_vector (sel, 2, nelt);
5185       if (!can_vec_perm_const_p (mode, mode, indices, false))
5186         return false;
5187     }
5188   return true;
5189 }
5190
5191 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5192    multiplication operands have differing signs and (b) we intend
5193    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5194    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5195
5196 static bool
5197 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5198                                  stmt_vec_info stmt_info)
5199 {
5200   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5201   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5202     return false;
5203
5204   tree rhs1 = gimple_assign_rhs1 (assign);
5205   tree rhs2 = gimple_assign_rhs2 (assign);
5206   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5207     return false;
5208
5209   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5210   gcc_assert (reduc_info->is_reduc_info);
5211   return !directly_supported_p (DOT_PROD_EXPR,
5212                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5213                                 optab_vector_mixed_sign);
5214 }
5215
5216 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5217    functions. Design better to avoid maintenance issues.  */
5218
5219 /* Function vect_model_reduction_cost.
5220
5221    Models cost for a reduction operation, including the vector ops
5222    generated within the strip-mine loop in some cases, the initial
5223    definition before the loop, and the epilogue code that must be generated.  */
5224
5225 static void
5226 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5227                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5228                            vect_reduction_type reduction_type,
5229                            int ncopies, stmt_vector_for_cost *cost_vec)
5230 {
5231   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5232   tree vectype;
5233   machine_mode mode;
5234   class loop *loop = NULL;
5235
5236   if (loop_vinfo)
5237     loop = LOOP_VINFO_LOOP (loop_vinfo);
5238
5239   /* Condition reductions generate two reductions in the loop.  */
5240   if (reduction_type == COND_REDUCTION)
5241     ncopies *= 2;
5242
5243   vectype = STMT_VINFO_VECTYPE (stmt_info);
5244   mode = TYPE_MODE (vectype);
5245   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5246
5247   gimple_match_op op;
5248   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5249     gcc_unreachable ();
5250
5251   bool emulated_mixed_dot_prod
5252     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5253   if (reduction_type == EXTRACT_LAST_REDUCTION)
5254     /* No extra instructions are needed in the prologue.  The loop body
5255        operations are costed in vectorizable_condition.  */
5256     inside_cost = 0;
5257   else if (reduction_type == FOLD_LEFT_REDUCTION)
5258     {
5259       /* No extra instructions needed in the prologue.  */
5260       prologue_cost = 0;
5261
5262       if (reduc_fn != IFN_LAST)
5263         /* Count one reduction-like operation per vector.  */
5264         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5265                                         stmt_info, 0, vect_body);
5266       else
5267         {
5268           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5269           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5270           inside_cost = record_stmt_cost (cost_vec, nelements,
5271                                           vec_to_scalar, stmt_info, 0,
5272                                           vect_body);
5273           inside_cost += record_stmt_cost (cost_vec, nelements,
5274                                            scalar_stmt, stmt_info, 0,
5275                                            vect_body);
5276         }
5277     }
5278   else
5279     {
5280       /* Add in the cost of the initial definitions.  */
5281       int prologue_stmts;
5282       if (reduction_type == COND_REDUCTION)
5283         /* For cond reductions we have four vectors: initial index, step,
5284            initial result of the data reduction, initial value of the index
5285            reduction.  */
5286         prologue_stmts = 4;
5287       else if (emulated_mixed_dot_prod)
5288         /* We need the initial reduction value and two invariants:
5289            one that contains the minimum signed value and one that
5290            contains half of its negative.  */
5291         prologue_stmts = 3;
5292       else
5293         prologue_stmts = 1;
5294       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5295                                          scalar_to_vec, stmt_info, 0,
5296                                          vect_prologue);
5297     }
5298
5299   /* Determine cost of epilogue code.
5300
5301      We have a reduction operator that will reduce the vector in one statement.
5302      Also requires scalar extract.  */
5303
5304   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5305     {
5306       if (reduc_fn != IFN_LAST)
5307         {
5308           if (reduction_type == COND_REDUCTION)
5309             {
5310               /* An EQ stmt and an COND_EXPR stmt.  */
5311               epilogue_cost += record_stmt_cost (cost_vec, 2,
5312                                                  vector_stmt, stmt_info, 0,
5313                                                  vect_epilogue);
5314               /* Reduction of the max index and a reduction of the found
5315                  values.  */
5316               epilogue_cost += record_stmt_cost (cost_vec, 2,
5317                                                  vec_to_scalar, stmt_info, 0,
5318                                                  vect_epilogue);
5319               /* A broadcast of the max value.  */
5320               epilogue_cost += record_stmt_cost (cost_vec, 1,
5321                                                  scalar_to_vec, stmt_info, 0,
5322                                                  vect_epilogue);
5323             }
5324           else
5325             {
5326               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5327                                                  stmt_info, 0, vect_epilogue);
5328               epilogue_cost += record_stmt_cost (cost_vec, 1,
5329                                                  vec_to_scalar, stmt_info, 0,
5330                                                  vect_epilogue);
5331             }
5332         }
5333       else if (reduction_type == COND_REDUCTION)
5334         {
5335           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5336           /* Extraction of scalar elements.  */
5337           epilogue_cost += record_stmt_cost (cost_vec,
5338                                              2 * estimated_nunits,
5339                                              vec_to_scalar, stmt_info, 0,
5340                                              vect_epilogue);
5341           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5342           epilogue_cost += record_stmt_cost (cost_vec,
5343                                              2 * estimated_nunits - 3,
5344                                              scalar_stmt, stmt_info, 0,
5345                                              vect_epilogue);
5346         }
5347       else if (reduction_type == EXTRACT_LAST_REDUCTION
5348                || reduction_type == FOLD_LEFT_REDUCTION)
5349         /* No extra instructions need in the epilogue.  */
5350         ;
5351       else
5352         {
5353           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5354           tree bitsize = TYPE_SIZE (op.type);
5355           int element_bitsize = tree_to_uhwi (bitsize);
5356           int nelements = vec_size_in_bits / element_bitsize;
5357
5358           if (op.code == COND_EXPR)
5359             op.code = MAX_EXPR;
5360
5361           /* We have a whole vector shift available.  */
5362           if (VECTOR_MODE_P (mode)
5363               && directly_supported_p (op.code, vectype)
5364               && have_whole_vector_shift (mode))
5365             {
5366               /* Final reduction via vector shifts and the reduction operator.
5367                  Also requires scalar extract.  */
5368               epilogue_cost += record_stmt_cost (cost_vec,
5369                                                  exact_log2 (nelements) * 2,
5370                                                  vector_stmt, stmt_info, 0,
5371                                                  vect_epilogue);
5372               epilogue_cost += record_stmt_cost (cost_vec, 1,
5373                                                  vec_to_scalar, stmt_info, 0,
5374                                                  vect_epilogue);
5375             }
5376           else
5377             /* Use extracts and reduction op for final reduction.  For N
5378                elements, we have N extracts and N-1 reduction ops.  */
5379             epilogue_cost += record_stmt_cost (cost_vec,
5380                                                nelements + nelements - 1,
5381                                                vector_stmt, stmt_info, 0,
5382                                                vect_epilogue);
5383         }
5384     }
5385
5386   if (dump_enabled_p ())
5387     dump_printf (MSG_NOTE,
5388                  "vect_model_reduction_cost: inside_cost = %d, "
5389                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5390                  prologue_cost, epilogue_cost);
5391 }
5392
5393 /* SEQ is a sequence of instructions that initialize the reduction
5394    described by REDUC_INFO.  Emit them in the appropriate place.  */
5395
5396 static void
5397 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5398                                 stmt_vec_info reduc_info, gimple *seq)
5399 {
5400   if (reduc_info->reused_accumulator)
5401     {
5402       /* When reusing an accumulator from the main loop, we only need
5403          initialization instructions if the main loop can be skipped.
5404          In that case, emit the initialization instructions at the end
5405          of the guard block that does the skip.  */
5406       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5407       gcc_assert (skip_edge);
5408       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5409       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5410     }
5411   else
5412     {
5413       /* The normal case: emit the initialization instructions on the
5414          preheader edge.  */
5415       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5416       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5417     }
5418 }
5419
5420 /* Function get_initial_def_for_reduction
5421
5422    Input:
5423    REDUC_INFO - the info_for_reduction
5424    INIT_VAL - the initial value of the reduction variable
5425    NEUTRAL_OP - a value that has no effect on the reduction, as per
5426                 neutral_op_for_reduction
5427
5428    Output:
5429    Return a vector variable, initialized according to the operation that
5430         STMT_VINFO performs. This vector will be used as the initial value
5431         of the vector of partial results.
5432
5433    The value we need is a vector in which element 0 has value INIT_VAL
5434    and every other element has value NEUTRAL_OP.  */
5435
5436 static tree
5437 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5438                                stmt_vec_info reduc_info,
5439                                tree init_val, tree neutral_op)
5440 {
5441   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5442   tree scalar_type = TREE_TYPE (init_val);
5443   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5444   tree init_def;
5445   gimple_seq stmts = NULL;
5446
5447   gcc_assert (vectype);
5448
5449   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5450               || SCALAR_FLOAT_TYPE_P (scalar_type));
5451
5452   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5453               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5454
5455   if (operand_equal_p (init_val, neutral_op))
5456     {
5457       /* If both elements are equal then the vector described above is
5458          just a splat.  */
5459       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5460       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5461     }
5462   else
5463     {
5464       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5465       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5466       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5467         {
5468           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5469              element 0.  */
5470           init_def = gimple_build_vector_from_val (&stmts, vectype,
5471                                                    neutral_op);
5472           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5473                                    vectype, init_def, init_val);
5474         }
5475       else
5476         {
5477           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5478           tree_vector_builder elts (vectype, 1, 2);
5479           elts.quick_push (init_val);
5480           elts.quick_push (neutral_op);
5481           init_def = gimple_build_vector (&stmts, &elts);
5482         }
5483     }
5484
5485   if (stmts)
5486     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5487   return init_def;
5488 }
5489
5490 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5491    which performs a reduction involving GROUP_SIZE scalar statements.
5492    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5493    is nonnull, introducing extra elements of that value will not change the
5494    result.  */
5495
5496 static void
5497 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5498                                 stmt_vec_info reduc_info,
5499                                 vec<tree> *vec_oprnds,
5500                                 unsigned int number_of_vectors,
5501                                 unsigned int group_size, tree neutral_op)
5502 {
5503   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5504   unsigned HOST_WIDE_INT nunits;
5505   unsigned j, number_of_places_left_in_vector;
5506   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5507   unsigned int i;
5508
5509   gcc_assert (group_size == initial_values.length () || neutral_op);
5510
5511   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5512      created vectors. It is greater than 1 if unrolling is performed.
5513
5514      For example, we have two scalar operands, s1 and s2 (e.g., group of
5515      strided accesses of size two), while NUNITS is four (i.e., four scalars
5516      of this type can be packed in a vector).  The output vector will contain
5517      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5518      will be 2).
5519
5520      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5521      vectors containing the operands.
5522
5523      For example, NUNITS is four as before, and the group size is 8
5524      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5525      {s5, s6, s7, s8}.  */
5526
5527   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5528     nunits = group_size;
5529
5530   number_of_places_left_in_vector = nunits;
5531   bool constant_p = true;
5532   tree_vector_builder elts (vector_type, nunits, 1);
5533   elts.quick_grow (nunits);
5534   gimple_seq ctor_seq = NULL;
5535   for (j = 0; j < nunits * number_of_vectors; ++j)
5536     {
5537       tree op;
5538       i = j % group_size;
5539
5540       /* Get the def before the loop.  In reduction chain we have only
5541          one initial value.  Else we have as many as PHIs in the group.  */
5542       if (i >= initial_values.length () || (j > i && neutral_op))
5543         op = neutral_op;
5544       else
5545         op = initial_values[i];
5546
5547       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5548       number_of_places_left_in_vector--;
5549       elts[nunits - number_of_places_left_in_vector - 1] = op;
5550       if (!CONSTANT_CLASS_P (op))
5551         constant_p = false;
5552
5553       if (number_of_places_left_in_vector == 0)
5554         {
5555           tree init;
5556           if (constant_p && !neutral_op
5557               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5558               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5559             /* Build the vector directly from ELTS.  */
5560             init = gimple_build_vector (&ctor_seq, &elts);
5561           else if (neutral_op)
5562             {
5563               /* Build a vector of the neutral value and shift the
5564                  other elements into place.  */
5565               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5566                                                    neutral_op);
5567               int k = nunits;
5568               while (k > 0 && elts[k - 1] == neutral_op)
5569                 k -= 1;
5570               while (k > 0)
5571                 {
5572                   k -= 1;
5573                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5574                                        vector_type, init, elts[k]);
5575                 }
5576             }
5577           else
5578             {
5579               /* First time round, duplicate ELTS to fill the
5580                  required number of vectors.  */
5581               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5582                                         elts, number_of_vectors, *vec_oprnds);
5583               break;
5584             }
5585           vec_oprnds->quick_push (init);
5586
5587           number_of_places_left_in_vector = nunits;
5588           elts.new_vector (vector_type, nunits, 1);
5589           elts.quick_grow (nunits);
5590           constant_p = true;
5591         }
5592     }
5593   if (ctor_seq != NULL)
5594     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5595 }
5596
5597 /* For a statement STMT_INFO taking part in a reduction operation return
5598    the stmt_vec_info the meta information is stored on.  */
5599
5600 stmt_vec_info
5601 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5602 {
5603   stmt_info = vect_orig_stmt (stmt_info);
5604   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5605   if (!is_a <gphi *> (stmt_info->stmt)
5606       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5607     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5608   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5609   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5610     {
5611       if (gimple_phi_num_args (phi) == 1)
5612         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5613     }
5614   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5615     {
5616       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5617       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5618         stmt_info = info;
5619     }
5620   return stmt_info;
5621 }
5622
5623 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5624    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5625    return false.  */
5626
5627 static bool
5628 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5629                                 stmt_vec_info reduc_info)
5630 {
5631   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5632   if (!main_loop_vinfo)
5633     return false;
5634
5635   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5636     return false;
5637
5638   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5639   auto_vec<tree, 16> main_loop_results (num_phis);
5640   auto_vec<tree, 16> initial_values (num_phis);
5641   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5642     {
5643       /* The epilogue loop can be entered either from the main loop or
5644          from an earlier guard block.  */
5645       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5646       for (tree incoming_value : reduc_info->reduc_initial_values)
5647         {
5648           /* Look for:
5649
5650                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5651                                     INITIAL_VALUE(guard block)>.  */
5652           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5653
5654           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5655           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5656
5657           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5658           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5659
5660           main_loop_results.quick_push (from_main_loop);
5661           initial_values.quick_push (from_skip);
5662         }
5663     }
5664   else
5665     /* The main loop dominates the epilogue loop.  */
5666     main_loop_results.splice (reduc_info->reduc_initial_values);
5667
5668   /* See if the main loop has the kind of accumulator we need.  */
5669   vect_reusable_accumulator *accumulator
5670     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5671   if (!accumulator
5672       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5673       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5674                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5675     return false;
5676
5677   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5678   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5679   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5680   unsigned HOST_WIDE_INT m;
5681   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5682                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5683     return false;
5684   /* Check the intermediate vector types and operations are available.  */
5685   tree prev_vectype = old_vectype;
5686   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5687   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5688     {
5689       intermediate_nunits = exact_div (intermediate_nunits, 2);
5690       tree intermediate_vectype = get_related_vectype_for_scalar_type
5691         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5692       if (!intermediate_vectype
5693           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5694                                     intermediate_vectype)
5695           || !can_vec_extract (TYPE_MODE (prev_vectype),
5696                                TYPE_MODE (intermediate_vectype)))
5697         return false;
5698       prev_vectype = intermediate_vectype;
5699     }
5700
5701   /* Non-SLP reductions might apply an adjustment after the reduction
5702      operation, in order to simplify the initialization of the accumulator.
5703      If the epilogue loop carries on from where the main loop left off,
5704      it should apply the same adjustment to the final reduction result.
5705
5706      If the epilogue loop can also be entered directly (rather than via
5707      the main loop), we need to be able to handle that case in the same way,
5708      with the same adjustment.  (In principle we could add a PHI node
5709      to select the correct adjustment, but in practice that shouldn't be
5710      necessary.)  */
5711   tree main_adjustment
5712     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5713   if (loop_vinfo->main_loop_edge && main_adjustment)
5714     {
5715       gcc_assert (num_phis == 1);
5716       tree initial_value = initial_values[0];
5717       /* Check that we can use INITIAL_VALUE as the adjustment and
5718          initialize the accumulator with a neutral value instead.  */
5719       if (!operand_equal_p (initial_value, main_adjustment))
5720         return false;
5721       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5722       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5723                                                     code, initial_value);
5724     }
5725   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5726   reduc_info->reduc_initial_values.truncate (0);
5727   reduc_info->reduc_initial_values.splice (initial_values);
5728   reduc_info->reused_accumulator = accumulator;
5729   return true;
5730 }
5731
5732 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5733    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5734
5735 static tree
5736 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5737                             gimple_seq *seq)
5738 {
5739   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5740   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5741   tree stype = TREE_TYPE (vectype);
5742   tree new_temp = vec_def;
5743   while (nunits > nunits1)
5744     {
5745       nunits /= 2;
5746       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5747                                                            stype, nunits);
5748       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5749
5750       /* The target has to make sure we support lowpart/highpart
5751          extraction, either via direct vector extract or through
5752          an integer mode punning.  */
5753       tree dst1, dst2;
5754       gimple *epilog_stmt;
5755       if (convert_optab_handler (vec_extract_optab,
5756                                  TYPE_MODE (TREE_TYPE (new_temp)),
5757                                  TYPE_MODE (vectype1))
5758           != CODE_FOR_nothing)
5759         {
5760           /* Extract sub-vectors directly once vec_extract becomes
5761              a conversion optab.  */
5762           dst1 = make_ssa_name (vectype1);
5763           epilog_stmt
5764               = gimple_build_assign (dst1, BIT_FIELD_REF,
5765                                      build3 (BIT_FIELD_REF, vectype1,
5766                                              new_temp, TYPE_SIZE (vectype1),
5767                                              bitsize_int (0)));
5768           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5769           dst2 =  make_ssa_name (vectype1);
5770           epilog_stmt
5771               = gimple_build_assign (dst2, BIT_FIELD_REF,
5772                                      build3 (BIT_FIELD_REF, vectype1,
5773                                              new_temp, TYPE_SIZE (vectype1),
5774                                              bitsize_int (bitsize)));
5775           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5776         }
5777       else
5778         {
5779           /* Extract via punning to appropriately sized integer mode
5780              vector.  */
5781           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5782           tree etype = build_vector_type (eltype, 2);
5783           gcc_assert (convert_optab_handler (vec_extract_optab,
5784                                              TYPE_MODE (etype),
5785                                              TYPE_MODE (eltype))
5786                       != CODE_FOR_nothing);
5787           tree tem = make_ssa_name (etype);
5788           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5789                                              build1 (VIEW_CONVERT_EXPR,
5790                                                      etype, new_temp));
5791           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5792           new_temp = tem;
5793           tem = make_ssa_name (eltype);
5794           epilog_stmt
5795               = gimple_build_assign (tem, BIT_FIELD_REF,
5796                                      build3 (BIT_FIELD_REF, eltype,
5797                                              new_temp, TYPE_SIZE (eltype),
5798                                              bitsize_int (0)));
5799           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5800           dst1 = make_ssa_name (vectype1);
5801           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5802                                              build1 (VIEW_CONVERT_EXPR,
5803                                                      vectype1, tem));
5804           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5805           tem = make_ssa_name (eltype);
5806           epilog_stmt
5807               = gimple_build_assign (tem, BIT_FIELD_REF,
5808                                      build3 (BIT_FIELD_REF, eltype,
5809                                              new_temp, TYPE_SIZE (eltype),
5810                                              bitsize_int (bitsize)));
5811           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5812           dst2 =  make_ssa_name (vectype1);
5813           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5814                                              build1 (VIEW_CONVERT_EXPR,
5815                                                      vectype1, tem));
5816           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5817         }
5818
5819       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5820     }
5821
5822   return new_temp;
5823 }
5824
5825 /* Function vect_create_epilog_for_reduction
5826
5827    Create code at the loop-epilog to finalize the result of a reduction
5828    computation.
5829
5830    STMT_INFO is the scalar reduction stmt that is being vectorized.
5831    SLP_NODE is an SLP node containing a group of reduction statements. The
5832      first one in this group is STMT_INFO.
5833    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5834    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5835      (counting from 0)
5836
5837    This function:
5838    1. Completes the reduction def-use cycles.
5839    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5840       by calling the function specified by REDUC_FN if available, or by
5841       other means (whole-vector shifts or a scalar loop).
5842       The function also creates a new phi node at the loop exit to preserve
5843       loop-closed form, as illustrated below.
5844
5845      The flow at the entry to this function:
5846
5847         loop:
5848           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5849           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5850           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5851         loop_exit:
5852           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5853           use <s_out0>
5854           use <s_out0>
5855
5856      The above is transformed by this function into:
5857
5858         loop:
5859           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5860           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5861           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5862         loop_exit:
5863           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5864           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5865           v_out2 = reduce <v_out1>
5866           s_out3 = extract_field <v_out2, 0>
5867           s_out4 = adjust_result <s_out3>
5868           use <s_out4>
5869           use <s_out4>
5870 */
5871
5872 static void
5873 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5874                                   stmt_vec_info stmt_info,
5875                                   slp_tree slp_node,
5876                                   slp_instance slp_node_instance)
5877 {
5878   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5879   gcc_assert (reduc_info->is_reduc_info);
5880   /* For double reductions we need to get at the inner loop reduction
5881      stmt which has the meta info attached.  Our stmt_info is that of the
5882      loop-closed PHI of the inner loop which we remember as
5883      def for the reduction PHI generation.  */
5884   bool double_reduc = false;
5885   stmt_vec_info rdef_info = stmt_info;
5886   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5887     {
5888       gcc_assert (!slp_node);
5889       double_reduc = true;
5890       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5891                                             (stmt_info->stmt, 0));
5892       stmt_info = vect_stmt_to_vectorize (stmt_info);
5893     }
5894   gphi *reduc_def_stmt
5895     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5896   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5897   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5898   tree vectype;
5899   machine_mode mode;
5900   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5901   basic_block exit_bb;
5902   tree scalar_dest;
5903   tree scalar_type;
5904   gimple *new_phi = NULL, *phi = NULL;
5905   gimple_stmt_iterator exit_gsi;
5906   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5907   gimple *epilog_stmt = NULL;
5908   gimple *exit_phi;
5909   tree bitsize;
5910   tree def;
5911   tree orig_name, scalar_result;
5912   imm_use_iterator imm_iter, phi_imm_iter;
5913   use_operand_p use_p, phi_use_p;
5914   gimple *use_stmt;
5915   auto_vec<tree> reduc_inputs;
5916   int j, i;
5917   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5918   unsigned int group_size = 1, k;
5919   auto_vec<gimple *> phis;
5920   /* SLP reduction without reduction chain, e.g.,
5921      # a1 = phi <a2, a0>
5922      # b1 = phi <b2, b0>
5923      a2 = operation (a1)
5924      b2 = operation (b1)  */
5925   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5926   bool direct_slp_reduc;
5927   tree induction_index = NULL_TREE;
5928
5929   if (slp_node)
5930     group_size = SLP_TREE_LANES (slp_node);
5931
5932   if (nested_in_vect_loop_p (loop, stmt_info))
5933     {
5934       outer_loop = loop;
5935       loop = loop->inner;
5936       gcc_assert (!slp_node && double_reduc);
5937     }
5938
5939   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5940   gcc_assert (vectype);
5941   mode = TYPE_MODE (vectype);
5942
5943   tree induc_val = NULL_TREE;
5944   tree adjustment_def = NULL;
5945   if (slp_node)
5946     ;
5947   else
5948     {
5949       /* Optimize: for induction condition reduction, if we can't use zero
5950          for induc_val, use initial_def.  */
5951       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5952         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5953       else if (double_reduc)
5954         ;
5955       else
5956         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5957     }
5958
5959   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5960   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5961   if (slp_reduc)
5962     /* All statements produce live-out values.  */
5963     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5964   else if (slp_node)
5965     {
5966       /* The last statement in the reduction chain produces the live-out
5967          value.  Note SLP optimization can shuffle scalar stmts to
5968          optimize permutations so we have to search for the last stmt.  */
5969       for (k = 0; k < group_size; ++k)
5970         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5971           {
5972             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5973             break;
5974           }
5975     }
5976
5977   unsigned vec_num;
5978   int ncopies;
5979   if (slp_node)
5980     {
5981       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5982       ncopies = 1;
5983     }
5984   else
5985     {
5986       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5987       vec_num = 1;
5988       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5989     }
5990
5991   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5992      which is updated with the current index of the loop for every match of
5993      the original loop's cond_expr (VEC_STMT).  This results in a vector
5994      containing the last time the condition passed for that vector lane.
5995      The first match will be a 1 to allow 0 to be used for non-matching
5996      indexes.  If there are no matches at all then the vector will be all
5997      zeroes.
5998
5999      PR92772: This algorithm is broken for architectures that support
6000      masked vectors, but do not provide fold_extract_last.  */
6001   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6002     {
6003       auto_vec<std::pair<tree, bool>, 2> ccompares;
6004       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6005       cond_info = vect_stmt_to_vectorize (cond_info);
6006       while (cond_info != reduc_info)
6007         {
6008           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6009             {
6010               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6011               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6012               ccompares.safe_push
6013                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6014                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
6015             }
6016           cond_info
6017             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6018                                                  1 + STMT_VINFO_REDUC_IDX
6019                                                         (cond_info)));
6020           cond_info = vect_stmt_to_vectorize (cond_info);
6021         }
6022       gcc_assert (ccompares.length () != 0);
6023
6024       tree indx_before_incr, indx_after_incr;
6025       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6026       int scalar_precision
6027         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6028       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6029       tree cr_index_vector_type = get_related_vectype_for_scalar_type
6030         (TYPE_MODE (vectype), cr_index_scalar_type,
6031          TYPE_VECTOR_SUBPARTS (vectype));
6032
6033       /* First we create a simple vector induction variable which starts
6034          with the values {1,2,3,...} (SERIES_VECT) and increments by the
6035          vector size (STEP).  */
6036
6037       /* Create a {1,2,3,...} vector.  */
6038       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6039
6040       /* Create a vector of the step value.  */
6041       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6042       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6043
6044       /* Create an induction variable.  */
6045       gimple_stmt_iterator incr_gsi;
6046       bool insert_after;
6047       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6048       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6049                  insert_after, &indx_before_incr, &indx_after_incr);
6050
6051       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6052          filled with zeros (VEC_ZERO).  */
6053
6054       /* Create a vector of 0s.  */
6055       tree zero = build_zero_cst (cr_index_scalar_type);
6056       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6057
6058       /* Create a vector phi node.  */
6059       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6060       new_phi = create_phi_node (new_phi_tree, loop->header);
6061       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6062                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
6063
6064       /* Now take the condition from the loops original cond_exprs
6065          and produce a new cond_exprs (INDEX_COND_EXPR) which for
6066          every match uses values from the induction variable
6067          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6068          (NEW_PHI_TREE).
6069          Finally, we update the phi (NEW_PHI_TREE) to take the value of
6070          the new cond_expr (INDEX_COND_EXPR).  */
6071       gimple_seq stmts = NULL;
6072       for (int i = ccompares.length () - 1; i != -1; --i)
6073         {
6074           tree ccompare = ccompares[i].first;
6075           if (ccompares[i].second)
6076             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6077                                          cr_index_vector_type,
6078                                          ccompare,
6079                                          indx_before_incr, new_phi_tree);
6080           else
6081             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6082                                          cr_index_vector_type,
6083                                          ccompare,
6084                                          new_phi_tree, indx_before_incr);
6085         }
6086       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6087
6088       /* Update the phi with the vec cond.  */
6089       induction_index = new_phi_tree;
6090       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6091                    loop_latch_edge (loop), UNKNOWN_LOCATION);
6092     }
6093
6094   /* 2. Create epilog code.
6095         The reduction epilog code operates across the elements of the vector
6096         of partial results computed by the vectorized loop.
6097         The reduction epilog code consists of:
6098
6099         step 1: compute the scalar result in a vector (v_out2)
6100         step 2: extract the scalar result (s_out3) from the vector (v_out2)
6101         step 3: adjust the scalar result (s_out3) if needed.
6102
6103         Step 1 can be accomplished using one of the following three schemes:
6104           (scheme 1) using reduc_fn, if available.
6105           (scheme 2) using whole-vector shifts, if available.
6106           (scheme 3) using a scalar loop. In this case steps 1+2 above are
6107                      combined.
6108
6109           The overall epilog code looks like this:
6110
6111           s_out0 = phi <s_loop>         # original EXIT_PHI
6112           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
6113           v_out2 = reduce <v_out1>              # step 1
6114           s_out3 = extract_field <v_out2, 0>    # step 2
6115           s_out4 = adjust_result <s_out3>       # step 3
6116
6117           (step 3 is optional, and steps 1 and 2 may be combined).
6118           Lastly, the uses of s_out0 are replaced by s_out4.  */
6119
6120
6121   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6122          v_out1 = phi <VECT_DEF>
6123          Store them in NEW_PHIS.  */
6124   if (double_reduc)
6125     loop = outer_loop;
6126   exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6127   exit_gsi = gsi_after_labels (exit_bb);
6128   reduc_inputs.create (slp_node ? vec_num : ncopies);
6129   for (unsigned i = 0; i < vec_num; i++)
6130     {
6131       gimple_seq stmts = NULL;
6132       if (slp_node)
6133         def = vect_get_slp_vect_def (slp_node, i);
6134       else
6135         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6136       for (j = 0; j < ncopies; j++)
6137         {
6138           tree new_def = copy_ssa_name (def);
6139           phi = create_phi_node (new_def, exit_bb);
6140           if (j)
6141             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6142           SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6143           new_def = gimple_convert (&stmts, vectype, new_def);
6144           reduc_inputs.quick_push (new_def);
6145         }
6146       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6147     }
6148
6149   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6150          (i.e. when reduc_fn is not available) and in the final adjustment
6151          code (if needed).  Also get the original scalar reduction variable as
6152          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6153          represents a reduction pattern), the tree-code and scalar-def are
6154          taken from the original stmt that the pattern-stmt (STMT) replaces.
6155          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6156          are taken from STMT.  */
6157
6158   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6159   if (orig_stmt_info != stmt_info)
6160     {
6161       /* Reduction pattern  */
6162       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6163       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6164     }
6165
6166   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6167   scalar_type = TREE_TYPE (scalar_dest);
6168   scalar_results.truncate (0);
6169   scalar_results.reserve_exact (group_size);
6170   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6171   bitsize = TYPE_SIZE (scalar_type);
6172
6173   /* True if we should implement SLP_REDUC using native reduction operations
6174      instead of scalar operations.  */
6175   direct_slp_reduc = (reduc_fn != IFN_LAST
6176                       && slp_reduc
6177                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6178
6179   /* In case of reduction chain, e.g.,
6180      # a1 = phi <a3, a0>
6181      a2 = operation (a1)
6182      a3 = operation (a2),
6183
6184      we may end up with more than one vector result.  Here we reduce them
6185      to one vector.
6186
6187      The same is true for a SLP reduction, e.g.,
6188      # a1 = phi <a2, a0>
6189      # b1 = phi <b2, b0>
6190      a2 = operation (a1)
6191      b2 = operation (b1),
6192
6193      where we can end up with more than one vector as well.  We can
6194      easily accumulate vectors when the number of vector elements is
6195      a multiple of the SLP group size.
6196
6197      The same is true if we couldn't use a single defuse cycle.  */
6198   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6199       || direct_slp_reduc
6200       || (slp_reduc
6201           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6202       || ncopies > 1)
6203     {
6204       gimple_seq stmts = NULL;
6205       tree single_input = reduc_inputs[0];
6206       for (k = 1; k < reduc_inputs.length (); k++)
6207         single_input = gimple_build (&stmts, code, vectype,
6208                                      single_input, reduc_inputs[k]);
6209       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6210
6211       reduc_inputs.truncate (0);
6212       reduc_inputs.safe_push (single_input);
6213     }
6214
6215   tree orig_reduc_input = reduc_inputs[0];
6216
6217   /* If this loop is an epilogue loop that can be skipped after the
6218      main loop, we can only share a reduction operation between the
6219      main loop and the epilogue if we put it at the target of the
6220      skip edge.
6221
6222      We can still reuse accumulators if this check fails.  Doing so has
6223      the minor(?) benefit of making the epilogue loop's scalar result
6224      independent of the main loop's scalar result.  */
6225   bool unify_with_main_loop_p = false;
6226   if (reduc_info->reused_accumulator
6227       && loop_vinfo->skip_this_loop_edge
6228       && single_succ_p (exit_bb)
6229       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6230     {
6231       unify_with_main_loop_p = true;
6232
6233       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6234       reduc_inputs[0] = make_ssa_name (vectype);
6235       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6236       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6237                    UNKNOWN_LOCATION);
6238       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6239                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6240       exit_gsi = gsi_after_labels (reduc_block);
6241     }
6242
6243   /* Shouldn't be used beyond this point.  */
6244   exit_bb = nullptr;
6245
6246   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6247       && reduc_fn != IFN_LAST)
6248     {
6249       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6250          various data values where the condition matched and another vector
6251          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6252          need to extract the last matching index (which will be the index with
6253          highest value) and use this to index into the data vector.
6254          For the case where there were no matches, the data vector will contain
6255          all default values and the index vector will be all zeros.  */
6256
6257       /* Get various versions of the type of the vector of indexes.  */
6258       tree index_vec_type = TREE_TYPE (induction_index);
6259       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6260       tree index_scalar_type = TREE_TYPE (index_vec_type);
6261       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6262
6263       /* Get an unsigned integer version of the type of the data vector.  */
6264       int scalar_precision
6265         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6266       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6267       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6268                                                 vectype);
6269
6270       /* First we need to create a vector (ZERO_VEC) of zeros and another
6271          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6272          can create using a MAX reduction and then expanding.
6273          In the case where the loop never made any matches, the max index will
6274          be zero.  */
6275
6276       /* Vector of {0, 0, 0,...}.  */
6277       tree zero_vec = build_zero_cst (vectype);
6278
6279       /* Find maximum value from the vector of found indexes.  */
6280       tree max_index = make_ssa_name (index_scalar_type);
6281       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6282                                                           1, induction_index);
6283       gimple_call_set_lhs (max_index_stmt, max_index);
6284       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6285
6286       /* Vector of {max_index, max_index, max_index,...}.  */
6287       tree max_index_vec = make_ssa_name (index_vec_type);
6288       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6289                                                       max_index);
6290       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6291                                                         max_index_vec_rhs);
6292       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6293
6294       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6295          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6296          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6297          otherwise.  Only one value should match, resulting in a vector
6298          (VEC_COND) with one data value and the rest zeros.
6299          In the case where the loop never made any matches, every index will
6300          match, resulting in a vector with all data values (which will all be
6301          the default value).  */
6302
6303       /* Compare the max index vector to the vector of found indexes to find
6304          the position of the max value.  */
6305       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6306       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6307                                                       induction_index,
6308                                                       max_index_vec);
6309       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6310
6311       /* Use the compare to choose either values from the data vector or
6312          zero.  */
6313       tree vec_cond = make_ssa_name (vectype);
6314       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6315                                                    vec_compare,
6316                                                    reduc_inputs[0],
6317                                                    zero_vec);
6318       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6319
6320       /* Finally we need to extract the data value from the vector (VEC_COND)
6321          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
6322          reduction, but because this doesn't exist, we can use a MAX reduction
6323          instead.  The data value might be signed or a float so we need to cast
6324          it first.
6325          In the case where the loop never made any matches, the data values are
6326          all identical, and so will reduce down correctly.  */
6327
6328       /* Make the matched data values unsigned.  */
6329       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6330       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6331                                        vec_cond);
6332       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6333                                                         VIEW_CONVERT_EXPR,
6334                                                         vec_cond_cast_rhs);
6335       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6336
6337       /* Reduce down to a scalar value.  */
6338       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6339       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6340                                                            1, vec_cond_cast);
6341       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6342       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6343
6344       /* Convert the reduced value back to the result type and set as the
6345          result.  */
6346       gimple_seq stmts = NULL;
6347       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6348                                data_reduc);
6349       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6350       scalar_results.safe_push (new_temp);
6351     }
6352   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6353            && reduc_fn == IFN_LAST)
6354     {
6355       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6356          idx = 0;
6357          idx_val = induction_index[0];
6358          val = data_reduc[0];
6359          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6360            if (induction_index[i] > idx_val)
6361              val = data_reduc[i], idx_val = induction_index[i];
6362          return val;  */
6363
6364       tree data_eltype = TREE_TYPE (vectype);
6365       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6366       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6367       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6368       /* Enforced by vectorizable_reduction, which ensures we have target
6369          support before allowing a conditional reduction on variable-length
6370          vectors.  */
6371       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6372       tree idx_val = NULL_TREE, val = NULL_TREE;
6373       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6374         {
6375           tree old_idx_val = idx_val;
6376           tree old_val = val;
6377           idx_val = make_ssa_name (idx_eltype);
6378           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6379                                              build3 (BIT_FIELD_REF, idx_eltype,
6380                                                      induction_index,
6381                                                      bitsize_int (el_size),
6382                                                      bitsize_int (off)));
6383           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6384           val = make_ssa_name (data_eltype);
6385           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6386                                              build3 (BIT_FIELD_REF,
6387                                                      data_eltype,
6388                                                      reduc_inputs[0],
6389                                                      bitsize_int (el_size),
6390                                                      bitsize_int (off)));
6391           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6392           if (off != 0)
6393             {
6394               tree new_idx_val = idx_val;
6395               if (off != v_size - el_size)
6396                 {
6397                   new_idx_val = make_ssa_name (idx_eltype);
6398                   epilog_stmt = gimple_build_assign (new_idx_val,
6399                                                      MAX_EXPR, idx_val,
6400                                                      old_idx_val);
6401                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6402                 }
6403               tree cond = make_ssa_name (boolean_type_node);
6404               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6405                                                  idx_val, old_idx_val);
6406               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6407               tree new_val = make_ssa_name (data_eltype);
6408               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6409                                                  cond, val, old_val);
6410               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6411               idx_val = new_idx_val;
6412               val = new_val;
6413             }
6414         }
6415       /* Convert the reduced value back to the result type and set as the
6416          result.  */
6417       gimple_seq stmts = NULL;
6418       val = gimple_convert (&stmts, scalar_type, val);
6419       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6420       scalar_results.safe_push (val);
6421     }
6422
6423   /* 2.3 Create the reduction code, using one of the three schemes described
6424          above. In SLP we simply need to extract all the elements from the
6425          vector (without reducing them), so we use scalar shifts.  */
6426   else if (reduc_fn != IFN_LAST && !slp_reduc)
6427     {
6428       tree tmp;
6429       tree vec_elem_type;
6430
6431       /* Case 1:  Create:
6432          v_out2 = reduc_expr <v_out1>  */
6433
6434       if (dump_enabled_p ())
6435         dump_printf_loc (MSG_NOTE, vect_location,
6436                          "Reduce using direct vector reduction.\n");
6437
6438       gimple_seq stmts = NULL;
6439       vec_elem_type = TREE_TYPE (vectype);
6440       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6441                                vec_elem_type, reduc_inputs[0]);
6442       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6443       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6444
6445       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6446           && induc_val)
6447         {
6448           /* Earlier we set the initial value to be a vector of induc_val
6449              values.  Check the result and if it is induc_val then replace
6450              with the original initial value, unless induc_val is
6451              the same as initial_def already.  */
6452           tree zcompare = make_ssa_name (boolean_type_node);
6453           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6454                                              new_temp, induc_val);
6455           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6456           tree initial_def = reduc_info->reduc_initial_values[0];
6457           tmp = make_ssa_name (new_scalar_dest);
6458           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6459                                              initial_def, new_temp);
6460           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6461           new_temp = tmp;
6462         }
6463
6464       scalar_results.safe_push (new_temp);
6465     }
6466   else if (direct_slp_reduc)
6467     {
6468       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6469          with the elements for other SLP statements replaced with the
6470          neutral value.  We can then do a normal reduction on each vector.  */
6471
6472       /* Enforced by vectorizable_reduction.  */
6473       gcc_assert (reduc_inputs.length () == 1);
6474       gcc_assert (pow2p_hwi (group_size));
6475
6476       gimple_seq seq = NULL;
6477
6478       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6479          and the same element size as VECTYPE.  */
6480       tree index = build_index_vector (vectype, 0, 1);
6481       tree index_type = TREE_TYPE (index);
6482       tree index_elt_type = TREE_TYPE (index_type);
6483       tree mask_type = truth_type_for (index_type);
6484
6485       /* Create a vector that, for each element, identifies which of
6486          the REDUC_GROUP_SIZE results should use it.  */
6487       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6488       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6489                             build_vector_from_val (index_type, index_mask));
6490
6491       /* Get a neutral vector value.  This is simply a splat of the neutral
6492          scalar value if we have one, otherwise the initial scalar value
6493          is itself a neutral value.  */
6494       tree vector_identity = NULL_TREE;
6495       tree neutral_op = NULL_TREE;
6496       if (slp_node)
6497         {
6498           tree initial_value = NULL_TREE;
6499           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6500             initial_value = reduc_info->reduc_initial_values[0];
6501           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6502                                                  initial_value, false);
6503         }
6504       if (neutral_op)
6505         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6506                                                         neutral_op);
6507       for (unsigned int i = 0; i < group_size; ++i)
6508         {
6509           /* If there's no universal neutral value, we can use the
6510              initial scalar value from the original PHI.  This is used
6511              for MIN and MAX reduction, for example.  */
6512           if (!neutral_op)
6513             {
6514               tree scalar_value = reduc_info->reduc_initial_values[i];
6515               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6516                                              scalar_value);
6517               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6518                                                               scalar_value);
6519             }
6520
6521           /* Calculate the equivalent of:
6522
6523              sel[j] = (index[j] == i);
6524
6525              which selects the elements of REDUC_INPUTS[0] that should
6526              be included in the result.  */
6527           tree compare_val = build_int_cst (index_elt_type, i);
6528           compare_val = build_vector_from_val (index_type, compare_val);
6529           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6530                                    index, compare_val);
6531
6532           /* Calculate the equivalent of:
6533
6534              vec = sel ? reduc_inputs[0] : vector_identity;
6535
6536              VEC is now suitable for a full vector reduction.  */
6537           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6538                                    sel, reduc_inputs[0], vector_identity);
6539
6540           /* Do the reduction and convert it to the appropriate type.  */
6541           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6542                                       TREE_TYPE (vectype), vec);
6543           scalar = gimple_convert (&seq, scalar_type, scalar);
6544           scalar_results.safe_push (scalar);
6545         }
6546       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6547     }
6548   else
6549     {
6550       bool reduce_with_shift;
6551       tree vec_temp;
6552
6553       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6554
6555       /* See if the target wants to do the final (shift) reduction
6556          in a vector mode of smaller size and first reduce upper/lower
6557          halves against each other.  */
6558       enum machine_mode mode1 = mode;
6559       tree stype = TREE_TYPE (vectype);
6560       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6561       unsigned nunits1 = nunits;
6562       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6563           && reduc_inputs.length () == 1)
6564         {
6565           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6566           /* For SLP reductions we have to make sure lanes match up, but
6567              since we're doing individual element final reduction reducing
6568              vector width here is even more important.
6569              ???  We can also separate lanes with permutes, for the common
6570              case of power-of-two group-size odd/even extracts would work.  */
6571           if (slp_reduc && nunits != nunits1)
6572             {
6573               nunits1 = least_common_multiple (nunits1, group_size);
6574               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6575             }
6576         }
6577       if (!slp_reduc
6578           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6579         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6580
6581       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6582                                                            stype, nunits1);
6583       reduce_with_shift = have_whole_vector_shift (mode1);
6584       if (!VECTOR_MODE_P (mode1)
6585           || !directly_supported_p (code, vectype1))
6586         reduce_with_shift = false;
6587
6588       /* First reduce the vector to the desired vector size we should
6589          do shift reduction on by combining upper and lower halves.  */
6590       gimple_seq stmts = NULL;
6591       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6592                                              code, &stmts);
6593       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6594       reduc_inputs[0] = new_temp;
6595
6596       if (reduce_with_shift && !slp_reduc)
6597         {
6598           int element_bitsize = tree_to_uhwi (bitsize);
6599           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6600              for variable-length vectors and also requires direct target support
6601              for loop reductions.  */
6602           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6603           int nelements = vec_size_in_bits / element_bitsize;
6604           vec_perm_builder sel;
6605           vec_perm_indices indices;
6606
6607           int elt_offset;
6608
6609           tree zero_vec = build_zero_cst (vectype1);
6610           /* Case 2: Create:
6611              for (offset = nelements/2; offset >= 1; offset/=2)
6612                 {
6613                   Create:  va' = vec_shift <va, offset>
6614                   Create:  va = vop <va, va'>
6615                 }  */
6616
6617           tree rhs;
6618
6619           if (dump_enabled_p ())
6620             dump_printf_loc (MSG_NOTE, vect_location,
6621                              "Reduce using vector shifts\n");
6622
6623           gimple_seq stmts = NULL;
6624           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6625           for (elt_offset = nelements / 2;
6626                elt_offset >= 1;
6627                elt_offset /= 2)
6628             {
6629               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6630               indices.new_vector (sel, 2, nelements);
6631               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6632               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6633                                        new_temp, zero_vec, mask);
6634               new_temp = gimple_build (&stmts, code,
6635                                        vectype1, new_name, new_temp);
6636             }
6637           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6638
6639           /* 2.4  Extract the final scalar result.  Create:
6640              s_out3 = extract_field <v_out2, bitpos>  */
6641
6642           if (dump_enabled_p ())
6643             dump_printf_loc (MSG_NOTE, vect_location,
6644                              "extract scalar result\n");
6645
6646           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6647                         bitsize, bitsize_zero_node);
6648           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6649           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6650           gimple_assign_set_lhs (epilog_stmt, new_temp);
6651           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6652           scalar_results.safe_push (new_temp);
6653         }
6654       else
6655         {
6656           /* Case 3: Create:
6657              s = extract_field <v_out2, 0>
6658              for (offset = element_size;
6659                   offset < vector_size;
6660                   offset += element_size;)
6661                {
6662                  Create:  s' = extract_field <v_out2, offset>
6663                  Create:  s = op <s, s'>  // For non SLP cases
6664                }  */
6665
6666           if (dump_enabled_p ())
6667             dump_printf_loc (MSG_NOTE, vect_location,
6668                              "Reduce using scalar code.\n");
6669
6670           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6671           int element_bitsize = tree_to_uhwi (bitsize);
6672           tree compute_type = TREE_TYPE (vectype);
6673           gimple_seq stmts = NULL;
6674           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6675             {
6676               int bit_offset;
6677               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6678                                        vec_temp, bitsize, bitsize_zero_node);
6679
6680               /* In SLP we don't need to apply reduction operation, so we just
6681                  collect s' values in SCALAR_RESULTS.  */
6682               if (slp_reduc)
6683                 scalar_results.safe_push (new_temp);
6684
6685               for (bit_offset = element_bitsize;
6686                    bit_offset < vec_size_in_bits;
6687                    bit_offset += element_bitsize)
6688                 {
6689                   tree bitpos = bitsize_int (bit_offset);
6690                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6691                                            compute_type, vec_temp,
6692                                            bitsize, bitpos);
6693                   if (slp_reduc)
6694                     {
6695                       /* In SLP we don't need to apply reduction operation, so
6696                          we just collect s' values in SCALAR_RESULTS.  */
6697                       new_temp = new_name;
6698                       scalar_results.safe_push (new_name);
6699                     }
6700                   else
6701                     new_temp = gimple_build (&stmts, code, compute_type,
6702                                              new_name, new_temp);
6703                 }
6704             }
6705
6706           /* The only case where we need to reduce scalar results in SLP, is
6707              unrolling.  If the size of SCALAR_RESULTS is greater than
6708              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6709              REDUC_GROUP_SIZE.  */
6710           if (slp_reduc)
6711             {
6712               tree res, first_res, new_res;
6713
6714               /* Reduce multiple scalar results in case of SLP unrolling.  */
6715               for (j = group_size; scalar_results.iterate (j, &res);
6716                    j++)
6717                 {
6718                   first_res = scalar_results[j % group_size];
6719                   new_res = gimple_build (&stmts, code, compute_type,
6720                                           first_res, res);
6721                   scalar_results[j % group_size] = new_res;
6722                 }
6723               scalar_results.truncate (group_size);
6724               for (k = 0; k < group_size; k++)
6725                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6726                                                     scalar_results[k]);
6727             }
6728           else
6729             {
6730               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6731               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6732               scalar_results.safe_push (new_temp);
6733             }
6734
6735           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6736         }
6737
6738       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6739           && induc_val)
6740         {
6741           /* Earlier we set the initial value to be a vector of induc_val
6742              values.  Check the result and if it is induc_val then replace
6743              with the original initial value, unless induc_val is
6744              the same as initial_def already.  */
6745           tree zcompare = make_ssa_name (boolean_type_node);
6746           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6747                                              induc_val);
6748           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6749           tree initial_def = reduc_info->reduc_initial_values[0];
6750           tree tmp = make_ssa_name (new_scalar_dest);
6751           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6752                                              initial_def, new_temp);
6753           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6754           scalar_results[0] = tmp;
6755         }
6756     }
6757
6758   /* 2.5 Adjust the final result by the initial value of the reduction
6759          variable. (When such adjustment is not needed, then
6760          'adjustment_def' is zero).  For example, if code is PLUS we create:
6761          new_temp = loop_exit_def + adjustment_def  */
6762
6763   if (adjustment_def)
6764     {
6765       gcc_assert (!slp_reduc);
6766       gimple_seq stmts = NULL;
6767       if (double_reduc)
6768         {
6769           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6770           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6771           new_temp = gimple_build (&stmts, code, vectype,
6772                                    reduc_inputs[0], adjustment_def);
6773         }
6774       else
6775         {
6776           new_temp = scalar_results[0];
6777           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6778           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6779                                            adjustment_def);
6780           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6781           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6782                                    new_temp, adjustment_def);
6783           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6784         }
6785
6786       epilog_stmt = gimple_seq_last_stmt (stmts);
6787       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6788       scalar_results[0] = new_temp;
6789     }
6790
6791   /* Record this operation if it could be reused by the epilogue loop.  */
6792   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6793       && reduc_inputs.length () == 1)
6794     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6795                                            { orig_reduc_input, reduc_info });
6796
6797   if (double_reduc)
6798     loop = outer_loop;
6799
6800   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6801           phis with new adjusted scalar results, i.e., replace use <s_out0>
6802           with use <s_out4>.
6803
6804      Transform:
6805         loop_exit:
6806           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6807           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6808           v_out2 = reduce <v_out1>
6809           s_out3 = extract_field <v_out2, 0>
6810           s_out4 = adjust_result <s_out3>
6811           use <s_out0>
6812           use <s_out0>
6813
6814      into:
6815
6816         loop_exit:
6817           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6818           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6819           v_out2 = reduce <v_out1>
6820           s_out3 = extract_field <v_out2, 0>
6821           s_out4 = adjust_result <s_out3>
6822           use <s_out4>
6823           use <s_out4> */
6824
6825   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6826   for (k = 0; k < live_out_stmts.size (); k++)
6827     {
6828       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6829       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6830
6831       phis.create (3);
6832       /* Find the loop-closed-use at the loop exit of the original scalar
6833          result.  (The reduction result is expected to have two immediate uses,
6834          one at the latch block, and one at the loop exit).  For double
6835          reductions we are looking for exit phis of the outer loop.  */
6836       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6837         {
6838           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6839             {
6840               if (!is_gimple_debug (USE_STMT (use_p)))
6841                 phis.safe_push (USE_STMT (use_p));
6842             }
6843           else
6844             {
6845               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6846                 {
6847                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6848
6849                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6850                     {
6851                       if (!flow_bb_inside_loop_p (loop,
6852                                              gimple_bb (USE_STMT (phi_use_p)))
6853                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6854                         phis.safe_push (USE_STMT (phi_use_p));
6855                     }
6856                 }
6857             }
6858         }
6859
6860       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6861         {
6862           /* Replace the uses:  */
6863           orig_name = PHI_RESULT (exit_phi);
6864
6865           /* Look for a single use at the target of the skip edge.  */
6866           if (unify_with_main_loop_p)
6867             {
6868               use_operand_p use_p;
6869               gimple *user;
6870               if (!single_imm_use (orig_name, &use_p, &user))
6871                 gcc_unreachable ();
6872               orig_name = gimple_get_lhs (user);
6873             }
6874
6875           scalar_result = scalar_results[k];
6876           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6877             {
6878               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6879                 SET_USE (use_p, scalar_result);
6880               update_stmt (use_stmt);
6881             }
6882         }
6883
6884       phis.release ();
6885     }
6886 }
6887
6888 /* Build the vector select "MASK ? VEC : IDENTITY" as a VEC_COND_EXPR
6889    assignment to a new temporary SSA name of type VECTYPE, insert that
6890    assignment before GSI and return the SSA name.  */
6891
6892 static tree
6893 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6894                      tree vec, tree identity)
6895 {
6896   tree merged = make_temp_ssa_name (vectype, NULL, "cond");
6897   gsi_insert_before (gsi, gimple_build_assign (merged, VEC_COND_EXPR, mask,
6898                                                vec, identity),
6899                      GSI_SAME_STMT);
6900   return merged;
6901 }
6902
6903 /* Fold every element of VECTOR_RHS into LHS with CODE, one element at a
6904    time in order of increasing bit offset.  The element extractions and the
6905    CODE statements are inserted before GSI and define new SSA names created
6906    from SCALAR_DEST.  Return the final accumulated value.  */
6907
6908 static tree
6909 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6910                        tree_code code, tree lhs, tree vector_rhs)
6911 {
6912   tree elt_type = TREE_TYPE (TREE_TYPE (vector_rhs));
6913   tree elt_bits = TYPE_SIZE (elt_type);
6914   const unsigned HOST_WIDE_INT total_bits
6915     = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vector_rhs)));
6916   const unsigned HOST_WIDE_INT step_bits = tree_to_uhwi (elt_bits);
6917
6918   tree accum = lhs;
6919   unsigned HOST_WIDE_INT pos = 0;
6920   while (pos < total_bits)
6921     {
6922       /* ELT = BIT_FIELD_REF <VECTOR_RHS, ELT_BITS, POS>.  */
6923       tree ref = build3 (BIT_FIELD_REF, elt_type, vector_rhs, elt_bits,
6924                          bitsize_int (pos));
6925       gassign *elt_def = gimple_build_assign (scalar_dest, ref);
6926       tree elt = make_ssa_name (scalar_dest, elt_def);
6927       gimple_assign_set_lhs (elt_def, elt);
6928       gsi_insert_before (gsi, elt_def, GSI_SAME_STMT);
6929
6930       /* ACCUM = ACCUM CODE ELT.  */
6931       gassign *acc_def = gimple_build_assign (scalar_dest, code, accum, elt);
6932       accum = make_ssa_name (scalar_dest, acc_def);
6933       gimple_assign_set_lhs (acc_def, accum);
6934       gsi_insert_before (gsi, acc_def, GSI_SAME_STMT);
6935       pos += step_bits;
6936     }
6937   return accum;
6938 }
6939
6940 /* Return the masked internal function that is equivalent to REDUC_FN and
6941    directly supported for vector type VECTYPE_IN, or IFN_LAST if none.  */
6942
6943 static internal_fn
6944 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6945 {
6946   /* IFN_FOLD_LEFT_PLUS is the only reduction function that has masked
6947      equivalents listed here.  */
6948   if (reduc_fn != IFN_FOLD_LEFT_PLUS)
6949     return IFN_LAST;
6950
6951   /* Candidates in order of preference: the mask-only form is tried
6952      before the mask-and-length form.  */
6953   const internal_fn candidates[] =
6954     {
6955       IFN_MASK_FOLD_LEFT_PLUS,
6956       IFN_MASK_LEN_FOLD_LEFT_PLUS
6957     };
6958
6959   for (internal_fn candidate : candidates)
6960     if (direct_internal_fn_supported_p (candidate, vectype_in,
6961                                         OPTIMIZE_FOR_SPEED))
6962       return candidate;
6963
6964   /* Neither form is directly supported for VECTYPE_IN when optimizing
6965      for speed.  */
6966   return IFN_LAST;
6967 }
6968
6969 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6970    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6971    statement.  CODE is the operation performed by STMT_INFO and OPS are
6972    its scalar operands.  REDUC_INDEX is the index of the operand in
6973    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6974    implements in-order reduction, or IFN_LAST if we should open-code it.
6975    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6976    that should be used to control the operation in a fully-masked loop.
        LENS specifies the lengths that are used instead when
        LOOP_VINFO_FULLY_WITH_LENGTH_P holds for LOOP_VINFO.

        CODE is either a tree code or an internal function; in the latter
        case it is translated with conditional_internal_fn_code, OPS[0] is
        used as the mask operand and OPS[2] as the vector input.  NUM_OPS is
        the number of entries in OPS and must be 2 or 4.

        New statements are inserted before GSI, except for the final one,
        which is passed to vect_finish_replace_stmt for the scalar statement
        that defines the live-out value.  If SLP_NODE is nonnull the vector
        operands are taken from it and every generated statement is pushed
        onto it with push_vec_def; otherwise the statement is recorded in
        STMT_VINFO_VEC_STMTS (STMT_INFO) and stored in *VEC_STMT.

        Return false (after dumping a missed-optimization note) if CODE is
        an internal function and SLP_NODE is nonnull; return true
        otherwise.  */
6977
6978 static bool
6979 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6980                                stmt_vec_info stmt_info,
6981                                gimple_stmt_iterator *gsi,
6982                                gimple **vec_stmt, slp_tree slp_node,
6983                                gimple *reduc_def_stmt,
6984                                code_helper code, internal_fn reduc_fn,
6985                                tree *ops, int num_ops, tree vectype_in,
6986                                int reduc_index, vec_loop_masks *masks,
6987                                vec_loop_lens *lens)
6988 {
6989   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6990   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
       /* IFN_LAST if REDUC_FN has no directly supported masked equivalent.  */
6991   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6992
       /* Only a single copy is handled (asserted below); an SLP node always
          counts as one copy here.  */
6993   int ncopies;
6994   if (slp_node)
6995     ncopies = 1;
6996   else
6997     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6998
6999   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7000   gcc_assert (ncopies == 1);
7001
       /* A CODE that is not a tree code is treated as a conditional internal
          function: continue with the tree code it maps to and remember that
          the operation carries its own mask operand.  */
7002   bool is_cond_op = false;
7003   if (!code.is_tree_code ())
7004     {
7005       code = conditional_internal_fn_code (internal_fn (code));
7006       gcc_assert (code != ERROR_MARK);
7007       is_cond_op = true;
7008     }
7009
7010   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7011
7012   if (slp_node)
7013     {
           /* Conditional operations are rejected for SLP.  */
7014       if (is_cond_op)
7015         {
7016           if (dump_enabled_p ())
7017             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7018                              "fold-left reduction on SLP not supported.\n");
7019           return false;
7020         }
7021
7022       gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7023                             TYPE_VECTOR_SUBPARTS (vectype_in)));
7024     }
7025
7026   /* The operands either come from a binary operation or an IFN_COND operation.
7027      The former is a gimple assign with binary rhs and the latter is a
7028      gimple call with four arguments.  */
7029   gcc_assert (num_ops == 2 || num_ops == 4);
       /* OP0 is the scalar operand that gets vectorized; OPMASK is only set
          (and only read) for conditional operations.  */
7030   tree op0, opmask;
7031   if (!is_cond_op)
         /* The operand of the binary operation that is not the reduction
            operand.  */
7032     op0 = ops[1 - reduc_index];
7033   else
7034     {
7035       op0 = ops[2];
7036       opmask = ops[0];
7037       gcc_assert (!slp_node);
7038     }
7039
       /* Collect the vector definitions of OP0 (and of OPMASK for conditional
          operations) and determine the scalar statement whose result is
          live-out: the last scalar statement of the SLP node, or STMT_INFO
          itself when not using SLP.  */
7040   int group_size = 1;
7041   stmt_vec_info scalar_dest_def_info;
7042   auto_vec<tree> vec_oprnds0, vec_opmask;
7043   if (slp_node)
7044     {
7045       auto_vec<vec<tree> > vec_defs (2);
7046       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7047       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7048       vec_defs[0].release ();
7049       vec_defs[1].release ();
7050       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7051       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7052     }
7053   else
7054     {
7055       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7056                                      op0, &vec_oprnds0);
7057       scalar_dest_def_info = stmt_info;
7058
7059       /* For an IFN_COND_OP we also need the vector mask operand.  */
7060       if (is_cond_op)
7061           vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7062                                          opmask, &vec_opmask);
7063     }
7064
7065   gimple *sdef = scalar_dest_def_info->stmt;
7066   tree scalar_dest = gimple_get_lhs (sdef);
7067   tree scalar_type = TREE_TYPE (scalar_dest);
       /* The running scalar value of the reduction; it starts as the result
          of the reduction phi.  */
7068   tree reduc_var = gimple_phi_result (reduc_def_stmt);
7069
       /* More than one vector operand is only expected with SLP.  */
7070   int vec_num = vec_oprnds0.length ();
7071   gcc_assert (vec_num == 1 || slp_node);
7072   tree vec_elem_type = TREE_TYPE (vectype_out);
7073   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7074
       /* In a fully-masked loop, prepare the vector that merge_with_identity
          substitutes for inactive lanes: a zero vector, or its negation when
          signed zeros are honored for VECTYPE_OUT (presumably because adding
          +0.0 could change the sign of a -0.0 accumulator -- TODO confirm).  */
7075   tree vector_identity = NULL_TREE;
7076   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7077     {
7078       vector_identity = build_zero_cst (vectype_out);
7079       if (!HONOR_SIGNED_ZEROS (vectype_out))
7080         ;
7081       else
7082         {
7083           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7084           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7085                                         vector_identity);
7086         }
7087     }
7088
7089   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7090   int i;
7091   tree def0;
       /* Emit one reduction step per vector operand, chaining the scalar
          result of each step into the next through REDUC_VAR.  */
7092   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7093     {
7094       gimple *new_stmt;
7095       tree mask = NULL_TREE;
7096       tree len = NULL_TREE;
7097       tree bias = NULL_TREE;
           /* Pick the mask: the loop mask in a fully-masked loop, otherwise
              the mask operand of a conditional operation, otherwise none.
              NOTE(review): when the loop is fully masked and the operation is
              conditional, VEC_OPMASK is not combined with the loop mask here
              -- confirm that this combination cannot reach this point.  */
7098       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7099         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7100       else if (is_cond_op)
7101         mask = vec_opmask[0];
           /* In a length-controlled loop fetch the length and the bias; if
              the operation is not conditional, use an all-ones mask.  */
7102       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7103         {
7104           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7105                                    i, 1);
7106           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7107           bias = build_int_cst (intQI_type_node, biasval);
7108           if (!is_cond_op)
7109             mask = build_minus_one_cst (truth_type_for (vectype_in));
7110         }
7111
7112       /* Handle MINUS by adding the negative.  */
7113       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7114         {
7115           tree negated = make_ssa_name (vectype_out);
7116           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7117           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7118           def0 = negated;
7119         }
7120
           /* Without a masked reduction function, replace the inactive lanes
              of DEF0 by the identity value so that they do not affect the
              result.  */
7121       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7122           && mask && mask_reduc_fn == IFN_LAST)
7123         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7124                                     vector_identity);
7125
7126       /* On the first iteration the input is simply the scalar phi
7127          result, and for subsequent iterations it is the output of
7128          the preceding operation.  */
7129       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7130         {
               /* Prefer the mask-and-length call, then the mask-only call,
                  and fall back to the unmasked REDUC_FN.  */
7131           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7132             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7133                                                    def0, mask, len, bias);
7134           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7135             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7136                                                    def0, mask);
7137           else
7138             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7139                                                    def0);
7140           /* For chained SLP reductions the output of the previous reduction
7141              operation serves as the input of the next. For the final statement
7142              the output cannot be a temporary - we reuse the original
7143              scalar destination of the last statement.  */
7144           if (i != vec_num - 1)
7145             {
7146               gimple_set_lhs (new_stmt, scalar_dest_var);
7147               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7148               gimple_set_lhs (new_stmt, reduc_var);
7149             }
7150         }
7151       else
7152         {
               /* No reduction function is available: open-code the reduction
                  as a sequence of scalar operations on the vector elements.  */
7153           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7154                                              tree_code (code), reduc_var, def0);
7155           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7156           /* Remove the statement, so that we can use the same code paths
7157              as for statements that we've just created.  */
7158           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7159           gsi_remove (&tmp_gsi, true);
7160         }
7161
           /* The statement for the last vector operand defines the original
              scalar destination and is handed to vect_finish_replace_stmt;
              all earlier ones are emitted at GSI.  */
7162       if (i == vec_num - 1)
7163         {
7164           gimple_set_lhs (new_stmt, scalar_dest);
7165           vect_finish_replace_stmt (loop_vinfo,
7166                                     scalar_dest_def_info,
7167                                     new_stmt);
7168         }
7169       else
7170         vect_finish_stmt_generation (loop_vinfo,
7171                                      scalar_dest_def_info,
7172                                      new_stmt, gsi);
7173
           /* Record the generated statement as the vector definition.  */
7174       if (slp_node)
7175         slp_node->push_vec_def (new_stmt);
7176       else
7177         {
7178           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7179           *vec_stmt = new_stmt;
7180         }
7181     }
7182
7183   return true;
7184 }
7185
7186 /* Function is_nonwrapping_integer_induction.
7187
7188    Return true if the induction phi of STMT_VINFO (part of loop LOOP) has a
7189    constant integer base and step and its value cannot wrap in its type.  */
7190
7191 static bool
7192 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7193 {
7194   gphi *iv_phi = as_a <gphi *> (stmt_vinfo->stmt);
7195   tree iv_base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7196   tree iv_step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7197   tree iv_type = TREE_TYPE (gimple_phi_result (iv_phi));
7198
7199   /* Only a constant integer base and step can be analyzed.  */
7200   if (!(TREE_CODE (iv_base) == INTEGER_CST
7201         && TREE_CODE (iv_step) == INTEGER_CST))
7202     return false;
7203
7204   /* Types with undefined overflow are accepted without a range check.  */
7205   if (TYPE_OVERFLOW_UNDEFINED (iv_type))
7206     return true;
7207
7208   /* Otherwise an upper bound on the number of executions is needed.  */
7209   widest_int max_execs;
7210   if (!max_stmt_executions (loop, &max_execs))
7211     return false;
7212
7213   /* Compute BASE + STEP * MAX_EXECS, giving up if either step overflows.  */
7214   const signop sgn = TYPE_SIGN (iv_type);
7215   wi::overflow_type ovf = wi::OVF_NONE;
7216   widest_int span = wi::mul (wi::to_widest (iv_step), max_execs, sgn, &ovf);
7217   if (ovf != wi::OVF_NONE)
7218     return false;
7219
7220   widest_int last = wi::add (wi::to_widest (iv_base), span, sgn, &ovf);
7221   if (ovf != wi::OVF_NONE)
7222     return false;
7223
7224   /* The largest value reached must be representable in the phi's type.  */
7225   return wi::min_precision (last, sgn) <= TYPE_PRECISION (iv_type);
7226 }
7227
7228 /* Return true if the operation CODE has to be masked by inserting a
7229    conditional expression, i.e. if COND_FN (IFN_LAST if there is none) is
7230    not supported for VECTYPE_IN and CODE is DOT_PROD_EXPR or SAD_EXPR.  */
7231 static bool
7232 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7233                          tree vectype_in)
7234 {
7235   /* A conditional function the target supports directly makes the extra
7236      conditional expression unnecessary.  */
7237   const bool cond_fn_usable
7238     = (cond_fn != IFN_LAST
7239        && direct_internal_fn_supported_p (cond_fn, vectype_in,
7240                                           OPTIMIZE_FOR_SPEED));
7241   if (cond_fn_usable)
7242     return false;
7243
7244   /* Codes that are not tree codes are never handled this way.  */
7245   if (!code.is_tree_code ())
7246     return false;
7247
7248   /* Of the tree codes, only these two are handled.  */
7249   const tree_code op = tree_code (code);
7250   return op == DOT_PROD_EXPR || op == SAD_EXPR;
7251 }
7252
7253 /* Enable masked vectorization of the CODE operation with operands VOP by
7254    inserting "masked_op1 = MASK ? VOP[1] : X" before GSI and replacing
7255    VOP[1] with masked_op1.  X is zero for DOT_PROD_EXPR and VOP[0] for
7256    SAD_EXPR; MASK is the loop mask.  No other code is handled.  */
7257 static void
7258 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7259                       gimple_stmt_iterator *gsi)
7260 {
7261   /* Value VOP[1] takes in the lanes that MASK switches off.  */
7262   tree inactive_val = NULL_TREE;
7263
7264   switch (tree_code (code))
7265     {
7266     case DOT_PROD_EXPR:
7267       /* Select zero for the inactive lanes.  */
7268       inactive_val = build_zero_cst (TREE_TYPE (vop[1]));
7269       break;
7270
7271     case SAD_EXPR:
7272       /* Select VOP[0] for the inactive lanes, so that both operands of
7273          the absolute difference are the same there.  */
7274       inactive_val = vop[0];
7275       break;
7276
7277     default:
7278       gcc_unreachable ();
7279     }
7280
7281   /* masked_op1 = MASK ? VOP[1] : INACTIVE_VAL.  */
7282   tree op1_type = TREE_TYPE (vop[1]);
7283   tree masked_op1 = make_temp_ssa_name (op1_type, NULL, "masked_op1");
7284   gsi_insert_before (gsi,
7285                      gimple_build_assign (masked_op1, VEC_COND_EXPR, mask,
7286                                           vop[1], inactive_val),
7287                      GSI_SAME_STMT);
7288   vop[1] = masked_op1;
7289 }
7290
7291 /* Function vectorizable_reduction.
7292
7293    Check if STMT_INFO performs a reduction operation that can be vectorized.
7294    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7295    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7296    Return true if STMT_INFO is vectorizable in this way.
7297
7298    This function also handles reduction idioms (patterns) that have been
7299    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7300    may be of this form:
7301      X = pattern_expr (arg0, arg1, ..., X)
7302    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7303    sequence that had been detected and replaced by the pattern-stmt
7304    (STMT_INFO).
7305
7306    This function also handles reduction of condition expressions, for example:
7307      for (int i = 0; i < N; i++)
7308        if (a[i] < value)
7309          last = a[i];
7310    This is handled by vectorising the loop and creating an additional vector
7311    containing the loop indexes for which "a[i] < value" was true.  In the
7312    function epilogue this is reduced to a single max value and then used to
7313    index into the vector of results.
7314
7315    In some cases of reduction patterns, the type of the reduction variable X is
7316    different than the type of the other arguments of STMT_INFO.
7317    In such cases, the vectype that is used when transforming STMT_INFO into
7318    a vector stmt is different than the vectype that is used to determine the
7319    vectorization factor, because it consists of a different number of elements
7320    than the actual number of elements that are being operated upon in parallel.
7321
7322    For example, consider an accumulation of shorts into an int accumulator.
7323    On some targets it's possible to vectorize this pattern operating on 8
7324    shorts at a time (hence, the vectype for purposes of determining the
7325    vectorization factor should be V8HI); on the other hand, the vectype that
7326    is used to create the vector form is actually V4SI (the type of the result).
7327
7328    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7329    indicates what is the actual level of parallelism (V8HI in the example), so
7330    that the right vectorization factor would be derived.  This vectype
7331    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7332    be used to create the vectorized stmt.  The right vectype for the vectorized
7333    stmt is obtained from the type of the result X:
7334       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7335
7336    This means that, contrary to "regular" reductions (or "regular" stmts in
7337    general), the following equation:
7338       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7339    does *NOT* necessarily hold for reduction patterns.  */
7340
7341 bool
7342 vectorizable_reduction (loop_vec_info loop_vinfo,
7343                         stmt_vec_info stmt_info, slp_tree slp_node,
7344                         slp_instance slp_node_instance,
7345                         stmt_vector_for_cost *cost_vec)
7346 {
7347   tree vectype_in = NULL_TREE;
7348   tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7349   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7350   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7351   stmt_vec_info cond_stmt_vinfo = NULL;
7352   int i;
7353   int ncopies;
7354   bool single_defuse_cycle = false;
7355   bool nested_cycle = false;
7356   bool double_reduc = false;
7357   int vec_num;
7358   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7359   tree cond_reduc_val = NULL_TREE;
7360
7361   /* Make sure it was already recognized as a reduction computation.  */
7362   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7363       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7364       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7365     return false;
7366
7367   /* The stmt we store reduction analysis meta on.  */
7368   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7369   reduc_info->is_reduc_info = true;
7370
7371   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7372     {
7373       if (is_a <gphi *> (stmt_info->stmt))
7374         {
7375           if (slp_node)
7376             {
7377               /* We eventually need to set a vector type on invariant
7378                  arguments.  */
7379               unsigned j;
7380               slp_tree child;
7381               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7382                 if (!vect_maybe_update_slp_op_vectype
7383                        (child, SLP_TREE_VECTYPE (slp_node)))
7384                   {
7385                     if (dump_enabled_p ())
7386                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7387                                        "incompatible vector types for "
7388                                        "invariants\n");
7389                     return false;
7390                   }
7391             }
7392           /* Analysis for double-reduction is done on the outer
7393              loop PHI, nested cycles have no further restrictions.  */
7394           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7395         }
7396       else
7397         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7398       return true;
7399     }
7400
7401   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7402   stmt_vec_info phi_info = stmt_info;
7403   if (!is_a <gphi *> (stmt_info->stmt))
7404     {
7405       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7406       return true;
7407     }
7408   if (slp_node)
7409     {
7410       slp_node_instance->reduc_phis = slp_node;
7411       /* ???  We're leaving slp_node to point to the PHIs, we only
7412          need it to get at the number of vector stmts which wasn't
7413          yet initialized for the instance root.  */
7414     }
7415   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7416     {
7417       use_operand_p use_p;
7418       gimple *use_stmt;
7419       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7420                                  &use_p, &use_stmt);
7421       gcc_assert (res);
7422       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7423     }
7424
7425   /* PHIs should not participate in patterns.  */
7426   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7427   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7428
7429   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7430      and compute the reduction chain length.  Discover the real
7431      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7432   tree reduc_def
7433     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7434                              loop_latch_edge
7435                                (gimple_bb (reduc_def_phi)->loop_father));
7436   unsigned reduc_chain_length = 0;
7437   bool only_slp_reduc_chain = true;
7438   stmt_info = NULL;
7439   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7440   while (reduc_def != PHI_RESULT (reduc_def_phi))
7441     {
7442       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7443       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7444       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7445         {
7446           if (dump_enabled_p ())
7447             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7448                              "reduction chain broken by patterns.\n");
7449           return false;
7450         }
7451       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7452         only_slp_reduc_chain = false;
7453       /* For epilogue generation live members of the chain need
7454          to point back to the PHI via their original stmt for
7455          info_for_reduction to work.  For SLP we need to look at
7456          all lanes here - even though we only will vectorize from
7457          the SLP node with live lane zero the other live lanes also
7458          need to be identified as part of a reduction to be able
7459          to skip code generation for them.  */
7460       if (slp_for_stmt_info)
7461         {
7462           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7463             if (STMT_VINFO_LIVE_P (s))
7464               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7465         }
7466       else if (STMT_VINFO_LIVE_P (vdef))
7467         STMT_VINFO_REDUC_DEF (def) = phi_info;
7468       gimple_match_op op;
7469       if (!gimple_extract_op (vdef->stmt, &op))
7470         {
7471           if (dump_enabled_p ())
7472             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7473                              "reduction chain includes unsupported"
7474                              " statement type.\n");
7475           return false;
7476         }
7477       if (CONVERT_EXPR_CODE_P (op.code))
7478         {
7479           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7480             {
7481               if (dump_enabled_p ())
7482                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7483                                  "conversion in the reduction chain.\n");
7484               return false;
7485             }
7486         }
7487       else if (!stmt_info)
7488         /* First non-conversion stmt.  */
7489         stmt_info = vdef;
7490       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7491       reduc_chain_length++;
7492       if (!stmt_info && slp_node)
7493         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7494     }
7495   /* PHIs should not participate in patterns.  */
7496   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7497
7498   if (nested_in_vect_loop_p (loop, stmt_info))
7499     {
7500       loop = loop->inner;
7501       nested_cycle = true;
7502     }
7503
7504   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7505      element.  */
7506   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7507     {
7508       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7509       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7510     }
7511   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7512     gcc_assert (slp_node
7513                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7514
7515   /* 1. Is vectorizable reduction?  */
7516   /* Not supportable if the reduction variable is used in the loop, unless
7517      it's a reduction chain.  */
7518   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7519       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7520     return false;
7521
7522   /* Reductions that are not used even in an enclosing outer-loop,
7523      are expected to be "live" (used out of the loop).  */
7524   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7525       && !STMT_VINFO_LIVE_P (stmt_info))
7526     return false;
7527
7528   /* 2. Has this been recognized as a reduction pattern?
7529
7530      Check if STMT represents a pattern that has been recognized
7531      in earlier analysis stages.  For stmts that represent a pattern,
7532      the STMT_VINFO_RELATED_STMT field records the last stmt in
7533      the original sequence that constitutes the pattern.  */
7534
7535   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7536   if (orig_stmt_info)
7537     {
7538       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7539       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7540     }
7541
7542   /* 3. Check the operands of the operation.  The first operands are defined
7543         inside the loop body. The last operand is the reduction variable,
7544         which is defined by the loop-header-phi.  */
7545
7546   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7547   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7548   gimple_match_op op;
7549   if (!gimple_extract_op (stmt_info->stmt, &op))
7550     gcc_unreachable ();
7551   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7552                             || op.code == WIDEN_SUM_EXPR
7553                             || op.code == SAD_EXPR);
7554
7555   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7556       && !SCALAR_FLOAT_TYPE_P (op.type))
7557     return false;
7558
7559   /* Do not try to vectorize bit-precision reductions.  */
7560   if (!type_has_mode_precision_p (op.type))
7561     return false;
7562
7563   /* For lane-reducing ops we're reducing the number of reduction PHIs
7564      which means the only use of that may be in the lane-reducing operation.  */
7565   if (lane_reduc_code_p
7566       && reduc_chain_length != 1
7567       && !only_slp_reduc_chain)
7568     {
7569       if (dump_enabled_p ())
7570         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7571                          "lane-reducing reduction with extra stmts.\n");
7572       return false;
7573     }
7574
7575   /* All uses but the last are expected to be defined in the loop.
7576      The last use is the reduction variable.  In case of nested cycle this
7577      assumption is not true: we use reduc_index to record the index of the
7578      reduction variable.  */
7579   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7580   /* We need to skip an extra operand for COND_EXPRs with embedded
7581      comparison.  */
7582   unsigned opno_adjust = 0;
7583   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7584     opno_adjust = 1;
7585   for (i = 0; i < (int) op.num_ops; i++)
7586     {
7587       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7588       if (i == 0 && op.code == COND_EXPR)
7589         continue;
7590
7591       stmt_vec_info def_stmt_info;
7592       enum vect_def_type dt;
7593       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7594                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7595                                &vectype_op[i], &def_stmt_info))
7596         {
7597           if (dump_enabled_p ())
7598             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7599                              "use not simple.\n");
7600           return false;
7601         }
7602       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7603         continue;
7604
7605       /* For an IFN_COND_OP we might hit the reduction definition operand
7606          twice (once as definition, once as else).  */
7607       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7608         continue;
7609
7610       /* There should be only one cycle def in the stmt, the one
7611          leading to reduc_def.  */
7612       if (VECTORIZABLE_CYCLE_DEF (dt))
7613         return false;
7614
7615       if (!vectype_op[i])
7616         vectype_op[i]
7617           = get_vectype_for_scalar_type (loop_vinfo,
7618                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7619
7620       /* To properly compute ncopies we are interested in the widest
7621          non-reduction input type in case we're looking at a widening
7622          accumulation that we later handle in vect_transform_reduction.  */
7623       if (lane_reduc_code_p
7624           && vectype_op[i]
7625           && (!vectype_in
7626               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7627                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7628         vectype_in = vectype_op[i];
7629
7630       if (op.code == COND_EXPR)
7631         {
7632           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
7633           if (dt == vect_constant_def)
7634             {
7635               cond_reduc_dt = dt;
7636               cond_reduc_val = op.ops[i];
7637             }
7638           if (dt == vect_induction_def
7639               && def_stmt_info
7640               && is_nonwrapping_integer_induction (def_stmt_info, loop))
7641             {
7642               cond_reduc_dt = dt;
7643               cond_stmt_vinfo = def_stmt_info;
7644             }
7645         }
7646     }
7647   if (!vectype_in)
7648     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7649   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7650
7651   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7652   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7653   /* If we have a condition reduction, see if we can simplify it further.  */
7654   if (v_reduc_type == COND_REDUCTION)
7655     {
7656       if (slp_node)
7657         return false;
7658
7659       /* When the condition uses the reduction value in the condition, fail.  */
7660       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7661         {
7662           if (dump_enabled_p ())
7663             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7664                              "condition depends on previous iteration\n");
7665           return false;
7666         }
7667
7668       if (reduc_chain_length == 1
7669           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7670                                               OPTIMIZE_FOR_SPEED)
7671               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7672                                                  vectype_in,
7673                                                  OPTIMIZE_FOR_SPEED)))
7674         {
7675           if (dump_enabled_p ())
7676             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7677                              "optimizing condition reduction with"
7678                              " FOLD_EXTRACT_LAST.\n");
7679           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7680         }
7681       else if (cond_reduc_dt == vect_induction_def)
7682         {
7683           tree base
7684             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7685           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7686
7687           gcc_assert (TREE_CODE (base) == INTEGER_CST
7688                       && TREE_CODE (step) == INTEGER_CST);
7689           cond_reduc_val = NULL_TREE;
7690           enum tree_code cond_reduc_op_code = ERROR_MARK;
7691           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7692           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7693             ;
7694           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7695              above base; punt if base is the minimum value of the type for
7696              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7697           else if (tree_int_cst_sgn (step) == -1)
7698             {
7699               cond_reduc_op_code = MIN_EXPR;
7700               if (tree_int_cst_sgn (base) == -1)
7701                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7702               else if (tree_int_cst_lt (base,
7703                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7704                 cond_reduc_val
7705                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7706             }
7707           else
7708             {
7709               cond_reduc_op_code = MAX_EXPR;
7710               if (tree_int_cst_sgn (base) == 1)
7711                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7712               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7713                                         base))
7714                 cond_reduc_val
7715                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7716             }
7717           if (cond_reduc_val)
7718             {
7719               if (dump_enabled_p ())
7720                 dump_printf_loc (MSG_NOTE, vect_location,
7721                                  "condition expression based on "
7722                                  "integer induction.\n");
7723               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7724               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7725                 = cond_reduc_val;
7726               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7727             }
7728         }
7729       else if (cond_reduc_dt == vect_constant_def)
7730         {
7731           enum vect_def_type cond_initial_dt;
7732           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7733           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7734           if (cond_initial_dt == vect_constant_def
7735               && types_compatible_p (TREE_TYPE (cond_initial_val),
7736                                      TREE_TYPE (cond_reduc_val)))
7737             {
7738               tree e = fold_binary (LE_EXPR, boolean_type_node,
7739                                     cond_initial_val, cond_reduc_val);
7740               if (e && (integer_onep (e) || integer_zerop (e)))
7741                 {
7742                   if (dump_enabled_p ())
7743                     dump_printf_loc (MSG_NOTE, vect_location,
7744                                      "condition expression based on "
7745                                      "compile time constant.\n");
7746                   /* Record reduction code at analysis stage.  */
7747                   STMT_VINFO_REDUC_CODE (reduc_info)
7748                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7749                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7750                 }
7751             }
7752         }
7753     }
7754
7755   if (STMT_VINFO_LIVE_P (phi_info))
7756     return false;
7757
7758   if (slp_node)
7759     ncopies = 1;
7760   else
7761     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7762
7763   gcc_assert (ncopies >= 1);
7764
7765   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7766
7767   if (nested_cycle)
7768     {
7769       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7770                   == vect_double_reduction_def);
7771       double_reduc = true;
7772     }
7773
7774   /* 4.2. Check support for the epilog operation.
7775
7776           If STMT represents a reduction pattern, then the type of the
7777           reduction variable may be different than the type of the rest
7778           of the arguments.  For example, consider the case of accumulation
7779           of shorts into an int accumulator; The original code:
7780                         S1: int_a = (int) short_a;
7781           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7782
7783           was replaced with:
7784                         STMT: int_acc = widen_sum <short_a, int_acc>
7785
7786           This means that:
7787           1. The tree-code that is used to create the vector operation in the
7788              epilog code (that reduces the partial results) is not the
7789              tree-code of STMT, but is rather the tree-code of the original
7790              stmt from the pattern that STMT is replacing.  I.e, in the example
7791              above we want to use 'widen_sum' in the loop, but 'plus' in the
7792              epilog.
7793           2. The type (mode) we use to check available target support
7794              for the vector operation to be created in the *epilog*, is
7795              determined by the type of the reduction variable (in the example
7796              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7797              However the type (mode) we use to check available target support
7798              for the vector operation to be created *inside the loop*, is
7799              determined by the type of the other arguments to STMT (in the
7800              example we'd check this: optab_handler (widen_sum_optab,
7801              vect_short_mode)).
7802
7803           This is contrary to "regular" reductions, in which the types of all
7804           the arguments are the same as the type of the reduction variable.
7805           For "regular" reductions we can therefore use the same vector type
7806           (and also the same tree-code) when generating the epilog code and
7807           when generating the code inside the loop.  */
7808
7809   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7810
7811   /* If-conversion might have created a conditional operation like
7812      IFN_COND_ADD already.  Use the internal code for the following checks.  */
7813   if (orig_code.is_internal_fn ())
7814     {
7815       tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7816       orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7817     }
7818
7819   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7820
7821   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7822   if (reduction_type == TREE_CODE_REDUCTION)
7823     {
7824       /* Check whether it's ok to change the order of the computation.
7825          Generally, when vectorizing a reduction we change the order of the
7826          computation.  This may change the behavior of the program in some
7827          cases, so we need to check that this is ok.  One exception is when
7828          vectorizing an outer-loop: the inner-loop is executed sequentially,
7829          and therefore vectorizing reductions in the inner-loop during
7830          outer-loop vectorization is safe.  Likewise when we are vectorizing
7831          a series of reductions using SLP and the VF is one the reductions
7832          are performed in scalar order.  */
7833       if (slp_node
7834           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7835           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7836         ;
7837       else if (needs_fold_left_reduction_p (op.type, orig_code))
7838         {
7839           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7840              is not directly used in stmt.  */
7841           if (!only_slp_reduc_chain
7842               && reduc_chain_length != 1)
7843             {
7844               if (dump_enabled_p ())
7845                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7846                                  "in-order reduction chain without SLP.\n");
7847               return false;
7848             }
7849           STMT_VINFO_REDUC_TYPE (reduc_info)
7850             = reduction_type = FOLD_LEFT_REDUCTION;
7851         }
7852       else if (!commutative_binary_op_p (orig_code, op.type)
7853                || !associative_binary_op_p (orig_code, op.type))
7854         {
7855           if (dump_enabled_p ())
7856             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7857                             "reduction: not commutative/associative\n");
7858           return false;
7859         }
7860     }
7861
7862   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7863       && ncopies > 1)
7864     {
7865       if (dump_enabled_p ())
7866         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7867                          "multiple types in double reduction or condition "
7868                          "reduction or fold-left reduction.\n");
7869       return false;
7870     }
7871
7872   internal_fn reduc_fn = IFN_LAST;
7873   if (reduction_type == TREE_CODE_REDUCTION
7874       || reduction_type == FOLD_LEFT_REDUCTION
7875       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7876       || reduction_type == CONST_COND_REDUCTION)
7877     {
7878       if (reduction_type == FOLD_LEFT_REDUCTION
7879           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7880           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7881         {
7882           if (reduc_fn != IFN_LAST
7883               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7884                                                   OPTIMIZE_FOR_SPEED))
7885             {
7886               if (dump_enabled_p ())
7887                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7888                                  "reduc op not supported by target.\n");
7889
7890               reduc_fn = IFN_LAST;
7891             }
7892         }
7893       else
7894         {
7895           if (!nested_cycle || double_reduc)
7896             {
7897               if (dump_enabled_p ())
7898                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899                                  "no reduc code for scalar code.\n");
7900
7901               return false;
7902             }
7903         }
7904     }
7905   else if (reduction_type == COND_REDUCTION)
7906     {
7907       int scalar_precision
7908         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7909       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7910       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7911                                                 vectype_out);
7912
7913       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7914                                           OPTIMIZE_FOR_SPEED))
7915         reduc_fn = IFN_REDUC_MAX;
7916     }
7917   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7918
7919   if (reduction_type != EXTRACT_LAST_REDUCTION
7920       && (!nested_cycle || double_reduc)
7921       && reduc_fn == IFN_LAST
7922       && !nunits_out.is_constant ())
7923     {
7924       if (dump_enabled_p ())
7925         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7926                          "missing target support for reduction on"
7927                          " variable-length vectors.\n");
7928       return false;
7929     }
7930
7931   /* For SLP reductions, see if there is a neutral value we can use.  */
7932   tree neutral_op = NULL_TREE;
7933   if (slp_node)
7934     {
7935       tree initial_value = NULL_TREE;
7936       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7937         initial_value = vect_phi_initial_value (reduc_def_phi);
7938       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7939                                              orig_code, initial_value);
7940     }
7941
7942   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7943     {
7944       /* We can't support in-order reductions of code such as this:
7945
7946            for (int i = 0; i < n1; ++i)
7947              for (int j = 0; j < n2; ++j)
7948                l += a[j];
7949
7950          since GCC effectively transforms the loop when vectorizing:
7951
7952            for (int i = 0; i < n1 / VF; ++i)
7953              for (int j = 0; j < n2; ++j)
7954                for (int k = 0; k < VF; ++k)
7955                  l += a[j];
7956
7957          which is a reassociation of the original operation.  */
7958       if (dump_enabled_p ())
7959         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7960                          "in-order double reduction not supported.\n");
7961
7962       return false;
7963     }
7964
7965   if (reduction_type == FOLD_LEFT_REDUCTION
7966       && slp_node
7967       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7968     {
7969       /* We cannot use in-order reductions in this case because there is
7970          an implicit reassociation of the operations involved.  */
7971       if (dump_enabled_p ())
7972         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7973                          "in-order unchained SLP reductions not supported.\n");
7974       return false;
7975     }
7976
7977   /* For double reductions, and for SLP reductions with a neutral value,
7978      we construct a variable-length initial vector by loading a vector
7979      full of the neutral value and then shift-and-inserting the start
7980      values into the low-numbered elements.  */
7981   if ((double_reduc || neutral_op)
7982       && !nunits_out.is_constant ()
7983       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7984                                           vectype_out, OPTIMIZE_FOR_SPEED))
7985     {
7986       if (dump_enabled_p ())
7987         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7988                          "reduction on variable-length vectors requires"
7989                          " target support for a vector-shift-and-insert"
7990                          " operation.\n");
7991       return false;
7992     }
7993
7994   /* Check extra constraints for variable-length unchained SLP reductions.  */
7995   if (slp_node
7996       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7997       && !nunits_out.is_constant ())
7998     {
7999       /* We checked above that we could build the initial vector when
8000          there's a neutral element value.  Check here for the case in
8001          which each SLP statement has its own initial value and in which
8002          that value needs to be repeated for every instance of the
8003          statement within the initial vector.  */
8004       unsigned int group_size = SLP_TREE_LANES (slp_node);
8005       if (!neutral_op
8006           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8007                                               TREE_TYPE (vectype_out)))
8008         {
8009           if (dump_enabled_p ())
8010             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8011                              "unsupported form of SLP reduction for"
8012                              " variable-length vectors: cannot build"
8013                              " initial vector.\n");
8014           return false;
8015         }
8016       /* The epilogue code relies on the number of elements being a multiple
8017          of the group size.  The duplicate-and-interleave approach to setting
8018          up the initial vector does too.  */
8019       if (!multiple_p (nunits_out, group_size))
8020         {
8021           if (dump_enabled_p ())
8022             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8023                              "unsupported form of SLP reduction for"
8024                              " variable-length vectors: the vector size"
8025                              " is not a multiple of the number of results.\n");
8026           return false;
8027         }
8028     }
8029
8030   if (reduction_type == COND_REDUCTION)
8031     {
8032       widest_int ni;
8033
8034       if (! max_loop_iterations (loop, &ni))
8035         {
8036           if (dump_enabled_p ())
8037             dump_printf_loc (MSG_NOTE, vect_location,
8038                              "loop count not known, cannot create cond "
8039                              "reduction.\n");
8040           return false;
8041         }
8042       /* Convert backedges to iterations.  */
8043       ni += 1;
8044
8045       /* The additional index will be the same type as the condition.  Check
8046          that the loop can fit into this less one (because we'll use up the
8047          zero slot for when there are no matches).  */
8048       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8049       if (wi::geu_p (ni, wi::to_widest (max_index)))
8050         {
8051           if (dump_enabled_p ())
8052             dump_printf_loc (MSG_NOTE, vect_location,
8053                              "loop size is greater than data size.\n");
8054           return false;
8055         }
8056     }
8057
8058   /* In case the vectorization factor (VF) is bigger than the number
8059      of elements that we can fit in a vectype (nunits), we have to generate
8060      more than one vector stmt - i.e - we need to "unroll" the
8061      vector stmt by a factor VF/nunits.  For more details see documentation
8062      in vectorizable_operation.  */
8063
8064   /* If the reduction is used in an outer loop we need to generate
8065      VF intermediate results, like so (e.g. for ncopies=2):
8066         r0 = phi (init, r0)
8067         r1 = phi (init, r1)
8068         r0 = x0 + r0;
8069         r1 = x1 + r1;
8070     (i.e. we generate VF results in 2 registers).
8071     In this case we have a separate def-use cycle for each copy, and therefore
8072     for each copy we get the vector def for the reduction variable from the
8073     respective phi node created for this copy.
8074
8075     Otherwise (the reduction is unused in the loop nest), we can combine
8076     together intermediate results, like so (e.g. for ncopies=2):
8077         r = phi (init, r)
8078         r = x0 + r;
8079         r = x1 + r;
8080    (i.e. we generate VF/2 results in a single register).
8081    In this case for each copy we get the vector def for the reduction variable
8082    from the vectorized reduction operation generated in the previous iteration.
8083
8084    This only works when we see both the reduction PHI and its only consumer
8085    in vectorizable_reduction and there are no intermediate stmts
8086    participating.  When unrolling we want each unrolled iteration to have its
8087    own reduction accumulator since one of the main goals of unrolling a
8088    reduction is to reduce the aggregate loop-carried latency.  */
8089   if (ncopies > 1
8090       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8091       && reduc_chain_length == 1
8092       && loop_vinfo->suggested_unroll_factor == 1)
8093     single_defuse_cycle = true;
8094
8095   if (single_defuse_cycle || lane_reduc_code_p)
8096     {
8097       gcc_assert (op.code != COND_EXPR);
8098
8099       /* 4. Supportable by target?  */
8100       bool ok = true;
8101
8102       /* 4.1. check support for the operation in the loop
8103
8104          This isn't necessary for the lane reduction codes, since they
8105          can only be produced by pattern matching, and it's up to the
8106          pattern matcher to test for support.  The main reason for
8107          specifically skipping this step is to avoid rechecking whether
8108          mixed-sign dot-products can be implemented using signed
8109          dot-products.  */
8110       machine_mode vec_mode = TYPE_MODE (vectype_in);
8111       if (!lane_reduc_code_p
8112           && !directly_supported_p (op.code, vectype_in, optab_vector))
8113         {
8114           if (dump_enabled_p ())
8115             dump_printf (MSG_NOTE, "op not supported by target.\n");
8116           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8117               || !vect_can_vectorize_without_simd_p (op.code))
8118             ok = false;
8119           else
8120             if (dump_enabled_p ())
8121               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8122         }
8123
8124       if (vect_emulated_vector_p (vectype_in)
8125           && !vect_can_vectorize_without_simd_p (op.code))
8126         {
8127           if (dump_enabled_p ())
8128             dump_printf (MSG_NOTE, "using word mode not possible.\n");
8129           return false;
8130         }
8131
8132       /* lane-reducing operations have to go through vect_transform_reduction.
8133          For the other cases try without the single cycle optimization.  */
8134       if (!ok)
8135         {
8136           if (lane_reduc_code_p)
8137             return false;
8138           else
8139             single_defuse_cycle = false;
8140         }
8141     }
8142   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8143
8144   /* If the reduction stmt is one of the patterns that have lane
8145      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
8146   if ((ncopies > 1 && ! single_defuse_cycle)
8147       && lane_reduc_code_p)
8148     {
8149       if (dump_enabled_p ())
8150         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8151                          "multi def-use cycle not possible for lane-reducing "
8152                          "reduction operation\n");
8153       return false;
8154     }
8155
8156   if (slp_node
8157       && !(!single_defuse_cycle
8158            && !lane_reduc_code_p
8159            && reduction_type != FOLD_LEFT_REDUCTION))
8160     for (i = 0; i < (int) op.num_ops; i++)
8161       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8162         {
8163           if (dump_enabled_p ())
8164             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8165                              "incompatible vector types for invariants\n");
8166           return false;
8167         }
8168
8169   if (slp_node)
8170     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8171   else
8172     vec_num = 1;
8173
8174   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8175                              reduction_type, ncopies, cost_vec);
8176   /* Cost the reduction op inside the loop if transformed via
8177      vect_transform_reduction.  Otherwise this is costed by the
8178      separate vectorizable_* routines.  */
8179   if (single_defuse_cycle || lane_reduc_code_p)
8180     {
8181       int factor = 1;
8182       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8183         /* Three dot-products and a subtraction.  */
8184         factor = 4;
8185       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8186                         stmt_info, 0, vect_body);
8187     }
8188
8189   if (dump_enabled_p ()
8190       && reduction_type == FOLD_LEFT_REDUCTION)
8191     dump_printf_loc (MSG_NOTE, vect_location,
8192                      "using an in-order (fold-left) reduction.\n");
8193   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8194   /* All but single defuse-cycle optimized, lane-reducing and fold-left
8195      reductions go through their own vectorizable_* routines.  */
8196   if (!single_defuse_cycle
8197       && !lane_reduc_code_p
8198       && reduction_type != FOLD_LEFT_REDUCTION)
8199     {
8200       stmt_vec_info tem
8201         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8202       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8203         {
8204           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8205           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8206         }
8207       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8208       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8209     }
8210   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8211     {
8212       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8213       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8214       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8215
8216       if (reduction_type != FOLD_LEFT_REDUCTION
8217           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8218           && (cond_fn == IFN_LAST
8219               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8220                                                   OPTIMIZE_FOR_SPEED)))
8221         {
8222           if (dump_enabled_p ())
8223             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8224                              "can't operate on partial vectors because"
8225                              " no conditional operation is available.\n");
8226           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8227         }
8228       else if (reduction_type == FOLD_LEFT_REDUCTION
8229                && reduc_fn == IFN_LAST
8230                && !expand_vec_cond_expr_p (vectype_in,
8231                                            truth_type_for (vectype_in),
8232                                            SSA_NAME))
8233         {
8234           if (dump_enabled_p ())
8235             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8236                              "can't operate on partial vectors because"
8237                              " no conditional operation is available.\n");
8238           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8239         }
8240       else if (reduction_type == FOLD_LEFT_REDUCTION
8241                && internal_fn_mask_index (reduc_fn) == -1
8242                && FLOAT_TYPE_P (vectype_in)
8243                && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8244         {
8245           if (dump_enabled_p ())
8246             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8247                              "can't operate on partial vectors because"
8248                              " signed zeros cannot be preserved.\n");
8249           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8250         }
8251       else
8252         {
8253           internal_fn mask_reduc_fn
8254             = get_masked_reduction_fn (reduc_fn, vectype_in);
8255
8256           if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8257             vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8258                                   vectype_in, 1);
8259           else
8260             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8261                                    vectype_in, NULL);
8262         }
8263     }
8264   return true;
8265 }
8266
8267 /* STMT_INFO is a dot-product reduction whose multiplication operands
8268    have different signs.  Emit a sequence to emulate the operation
8269    using a series of signed DOT_PROD_EXPRs and return the last
8270    statement generated.  VEC_DEST is the result of the vector operation
8271    and VOP lists its inputs.

        The intermediate statements are emitted at GSI through
        vect_finish_stmt_generation.  The returned conversion statement is
        only built here, not emitted by this function.  VOP is modified in
        place: its first two entries may be swapped and entries with an
        unsigned type are replaced by signed copies.  */
8272
8273 static gassign *
8274 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8275                              gimple_stmt_iterator *gsi, tree vec_dest,
8276                              tree vop[3])
8277 {
8278   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8279   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8280   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8281   gimple *new_stmt;
8282
8283   /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
8284   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8285     std::swap (vop[0], vop[1]);
8286
8287   /* Convert all inputs to signed types.  The loop covers all three
          entries of VOP, so VOP[2] is converted too if its type is
          unsigned.  */
8288   for (int i = 0; i < 3; ++i)
8289     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8290       {
8291         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8292         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8293         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8294         vop[i] = tmp;
8295       }
8296
8297   /* In the comments below we assume 8-bit inputs for simplicity,
8298      but the approach works for any full integer type.  */
8299
8300   /* Create a vector of -128.  */
8301   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8302   tree min_narrow = build_vector_from_val (narrow_vectype,
8303                                            min_narrow_elttype);
8304
8305   /* Create a vector of 64, obtained by shifting the bit pattern of
          -128 right by one with wi::lrshift.  */
8306   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8307   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8308   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8309
8310   /* Emit: SUB_RES = VOP[0] - 128.  This is built as a PLUS_EXPR with
          the vector of -128 created above.  */
8311   tree sub_res = make_ssa_name (narrow_vectype);
8312   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8313   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8314
8315   /* Emit:
8316
8317        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8318        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8319        STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8320
8321      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y,
          where x is the originally unsigned VOP[0] and y is VOP[1].
8322      Doing the two 64 * y steps first allows more time to compute x.  */
8323   tree stage1 = make_ssa_name (wide_vectype);
8324   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8325                                   vop[1], half_narrow, vop[2]);
8326   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8327
8328   tree stage2 = make_ssa_name (wide_vectype);
8329   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8330                                   vop[1], half_narrow, stage1);
8331   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8332
8333   tree stage3 = make_ssa_name (wide_vectype);
8334   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8335                                   sub_res, vop[1], stage2);
8336   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8337
8338   /* Convert STAGE3 to the reduction type.  The statement is returned
          without being emitted here.  */
8339   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8340 }
8341
8342 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8343    value.

        Vector statements are emitted at GSI.  For fold-left reductions the
        work is delegated to vectorize_fold_left_reduction and its result
        is returned; otherwise the function returns true.  In the non-SLP
        case *VEC_STMT is set to the first vector statement recorded for
        STMT_INFO; in the SLP case each generated statement is pushed onto
        SLP_NODE instead.  */
8344
8345 bool
8346 vect_transform_reduction (loop_vec_info loop_vinfo,
8347                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8348                           gimple **vec_stmt, slp_tree slp_node)
8349 {
8350   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8351   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8352   int i;
8353   int ncopies;
8354   int vec_num;
8355
8356   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8357   gcc_assert (reduc_info->is_reduc_info);
8358
8359   if (nested_in_vect_loop_p (loop, stmt_info))
8360     {
8361       loop = loop->inner;
8362       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8363     }
8364
8365   gimple_match_op op;
8366   if (!gimple_extract_op (stmt_info->stmt, &op))
8367     gcc_unreachable ();
8368
8369   /* All uses but the last are expected to be defined in the loop.
8370      The last use is the reduction variable.  In case of nested cycle this
8371      assumption is not true: we use reduc_index to record the index of the
8372      reduction variable.  */
8373   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8374   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8375   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8376   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8377
8378   if (slp_node)
8379     {
8380       ncopies = 1;
8381       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8382     }
8383   else
8384     {
8385       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8386       vec_num = 1;
8387     }
8388
8389   code_helper code = canonicalize_code (op.code, op.type);
8390   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8391
8392   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8393   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8394   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8395
8396   /* Transform.  */
8397   tree new_temp = NULL_TREE;
8398   auto_vec<tree> vec_oprnds0;
8399   auto_vec<tree> vec_oprnds1;
8400   auto_vec<tree> vec_oprnds2;
8401   tree def0;
8402
8403   if (dump_enabled_p ())
8404     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8405
8406   /* FORNOW: Multiple types are not supported for condition.  */
8407   if (code == COND_EXPR)
8408     gcc_assert (ncopies == 1);
8409
8410   /* A binary COND_OP reduction must have the same definition and else
8411      value.  The asserts below restrict the accepted conditional
          functions to the listed ones and require four operands with
          operand 1 equal to operand 3.  */
8412   bool cond_fn_p = code.is_internal_fn ()
8413     && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8414   if (cond_fn_p)
8415     {
8416       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8417                   || code == IFN_COND_MUL || code == IFN_COND_AND
8418                   || code == IFN_COND_IOR || code == IFN_COND_XOR);
8419       gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
8420     }
8421
8422   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8423
       /* Fold-left reductions are generated entirely by
          vectorize_fold_left_reduction; nothing below runs for them.  */
8424   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8425   if (reduction_type == FOLD_LEFT_REDUCTION)
8426     {
8427       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8428       gcc_assert (code.is_tree_code () || cond_fn_p);
8429       return vectorize_fold_left_reduction
8430           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8431            code, reduc_fn, op.ops, op.num_ops, vectype_in,
8432            reduc_index, masks, lens);
8433     }
8434
       /* Anything else reaching this point must either use a single
          def-use cycle or be one of the codes listed in the assert.  */
8435   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8436   gcc_assert (single_defuse_cycle
8437               || code == DOT_PROD_EXPR
8438               || code == WIDEN_SUM_EXPR
8439               || code == SAD_EXPR);
8440
8441   /* Create the destination vector  */
8442   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8443   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8444
8445   /* Get NCOPIES vector definitions for all operands except the reduction
8446      definition.  For a single def-use cycle the operand at REDUC_INDEX
          is passed as NULL_TREE here and is filled in separately below.  */
8447   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8448                      single_defuse_cycle && reduc_index == 0
8449                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
8450                      single_defuse_cycle && reduc_index == 1
8451                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
8452                      op.num_ops == 4
8453                      || (op.num_ops == 3
8454                          && !(single_defuse_cycle && reduc_index == 2))
8455                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8456
8457   /* For single def-use cycles get one copy of the vectorized reduction
8458      definition.  */
8459   if (single_defuse_cycle)
8460     {
8461       gcc_assert (!slp_node);
8462       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8463                                      op.ops[reduc_index],
8464                                      reduc_index == 0 ? &vec_oprnds0
8465                                      : (reduc_index == 1 ? &vec_oprnds1
8466                                         : &vec_oprnds2));
8467     }
8468
8469   bool emulated_mixed_dot_prod
8470     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8471   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8472     {
8473       gimple *new_stmt;
8474       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8475       if (masked_loop_p && !mask_by_cond_expr)
8476         {
8477           /* No conditional ifns have been defined for dot-product yet.  */
8478           gcc_assert (code != DOT_PROD_EXPR);
8479
8480           /* Make sure that the reduction accumulator is vop[0].  */
8481           if (reduc_index == 1)
8482             {
8483               gcc_assert (commutative_binary_op_p (code, op.type));
8484               std::swap (vop[0], vop[1]);
8485             }
8486           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8487                                           vec_num * ncopies, vectype_in, i);
               /* Build COND_FN (MASK, VOP[0], VOP[1], VOP[0]): the
                  accumulator is passed both as the first data operand
                  and as the final argument.  */
8488           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8489                                                     vop[0], vop[1], vop[0]);
8490           new_temp = make_ssa_name (vec_dest, call);
8491           gimple_call_set_lhs (call, new_temp);
8492           gimple_call_set_nothrow (call, true);
8493           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8494           new_stmt = call;
8495         }
8496       else
8497         {
8498           if (op.num_ops >= 3)
8499             vop[2] = vec_oprnds2[i];
8500
8501           if (masked_loop_p && mask_by_cond_expr)
8502             {
8503               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8504                                               vec_num * ncopies, vectype_in, i);
8505               build_vect_cond_expr (code, vop, mask, gsi);
8506             }
8507
               /* Build the vector statement; whichever form is chosen, it
                  gets a fresh SSA name as its lhs and is emitted below.  */
8508           if (emulated_mixed_dot_prod)
8509             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8510                                                     vec_dest, vop);
8511
8512           else if (code.is_internal_fn () && !cond_fn_p)
8513             new_stmt = gimple_build_call_internal (internal_fn (code),
8514                                                    op.num_ops,
8515                                                    vop[0], vop[1], vop[2]);
8516           else if (code.is_internal_fn () && cond_fn_p)
8517             new_stmt = gimple_build_call_internal (internal_fn (code),
8518                                                    op.num_ops,
8519                                                    vop[0], vop[1], vop[2],
8520                                                    vop[1]);
8521           else
8522             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8523                                             vop[0], vop[1], vop[2]);
8524           new_temp = make_ssa_name (vec_dest, new_stmt);
8525           gimple_set_lhs (new_stmt, new_temp);
8526           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8527         }
8528
8529       if (slp_node)
8530         slp_node->push_vec_def (new_stmt);
8531       else if (single_defuse_cycle
8532                && i < ncopies - 1)
8533         {
               /* Not the last copy of a single def-use cycle: append this
                  result to the operand vector at REDUC_INDEX so that a
                  later iteration picks it up as the reduction operand.  */
8534           if (reduc_index == 0)
8535             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8536           else if (reduc_index == 1)
8537             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8538           else if (reduc_index == 2)
8539             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8540         }
8541       else
8542         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8543     }
8544
8545   if (!slp_node)
8546     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8547
8548   return true;
8549 }
8550
8551 /* Transform phase of a cycle PHI.  */
8552
8553 bool
8554 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8555                           stmt_vec_info stmt_info, gimple **vec_stmt,
8556                           slp_tree slp_node, slp_instance slp_node_instance)
8557 {
8558   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8559   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8560   int i;
8561   int ncopies;
8562   int j;
8563   bool nested_cycle = false;
8564   int vec_num;
8565
8566   if (nested_in_vect_loop_p (loop, stmt_info))
8567     {
8568       loop = loop->inner;
8569       nested_cycle = true;
8570     }
8571
8572   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8573   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8574   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8575   gcc_assert (reduc_info->is_reduc_info);
8576
8577   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8578       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8579     /* Leave the scalar phi in place.  */
8580     return true;
8581
8582   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8583   /* For a nested cycle we do not fill the above.  */
8584   if (!vectype_in)
8585     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8586   gcc_assert (vectype_in);
8587
8588   if (slp_node)
8589     {
8590       /* The size vect_schedule_slp_instance computes is off for us.  */
8591       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8592                                       * SLP_TREE_LANES (slp_node), vectype_in);
8593       ncopies = 1;
8594     }
8595   else
8596     {
8597       vec_num = 1;
8598       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8599     }
8600
8601   /* Check whether we should use a single PHI node and accumulate
8602      vectors to one before the backedge.  */
8603   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8604     ncopies = 1;
8605
8606   /* Create the destination vector  */
8607   gphi *phi = as_a <gphi *> (stmt_info->stmt);
8608   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8609                                                vectype_out);
8610
8611   /* Get the loop-entry arguments.  */
8612   tree vec_initial_def = NULL_TREE;
8613   auto_vec<tree> vec_initial_defs;
8614   if (slp_node)
8615     {
8616       vec_initial_defs.reserve (vec_num);
8617       if (nested_cycle)
8618         {
8619           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8620           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8621                              &vec_initial_defs);
8622         }
8623       else
8624         {
8625           gcc_assert (slp_node == slp_node_instance->reduc_phis);
8626           vec<tree> &initial_values = reduc_info->reduc_initial_values;
8627           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8628
8629           unsigned int num_phis = stmts.length ();
8630           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8631             num_phis = 1;
8632           initial_values.reserve (num_phis);
8633           for (unsigned int i = 0; i < num_phis; ++i)
8634             {
8635               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8636               initial_values.quick_push (vect_phi_initial_value (this_phi));
8637             }
8638           if (vec_num == 1)
8639             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8640           if (!initial_values.is_empty ())
8641             {
8642               tree initial_value
8643                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8644               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8645               tree neutral_op
8646                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8647                                             code, initial_value);
8648               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8649                                               &vec_initial_defs, vec_num,
8650                                               stmts.length (), neutral_op);
8651             }
8652         }
8653     }
8654   else
8655     {
8656       /* Get at the scalar def before the loop, that defines the initial
8657          value of the reduction variable.  */
8658       tree initial_def = vect_phi_initial_value (phi);
8659       reduc_info->reduc_initial_values.safe_push (initial_def);
8660       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8661          and we can't use zero for induc_val, use initial_def.  Similarly
8662          for REDUC_MIN and initial_def larger than the base.  */
8663       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8664         {
8665           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8666           if (TREE_CODE (initial_def) == INTEGER_CST
8667               && !integer_zerop (induc_val)
8668               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8669                    && tree_int_cst_lt (initial_def, induc_val))
8670                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8671                       && tree_int_cst_lt (induc_val, initial_def))))
8672             {
8673               induc_val = initial_def;
8674               /* Communicate we used the initial_def to epilouge
8675                  generation.  */
8676               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8677             }
8678           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8679         }
8680       else if (nested_cycle)
8681         {
8682           /* Do not use an adjustment def as that case is not supported
8683              correctly if ncopies is not one.  */
8684           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8685                                          ncopies, initial_def,
8686                                          &vec_initial_defs);
8687         }
8688       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8689                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8690         /* Fill the initial vector with the initial scalar value.  */
8691         vec_initial_def
8692           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8693                                            initial_def, initial_def);
8694       else
8695         {
8696           if (ncopies == 1)
8697             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8698           if (!reduc_info->reduc_initial_values.is_empty ())
8699             {
8700               initial_def = reduc_info->reduc_initial_values[0];
8701               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8702               tree neutral_op
8703                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8704                                             code, initial_def);
8705               gcc_assert (neutral_op);
8706               /* Try to simplify the vector initialization by applying an
8707                  adjustment after the reduction has been performed.  */
8708               if (!reduc_info->reused_accumulator
8709                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8710                   && !operand_equal_p (neutral_op, initial_def))
8711                 {
8712                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8713                     = initial_def;
8714                   initial_def = neutral_op;
8715                 }
8716               vec_initial_def
8717                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8718                                                  initial_def, neutral_op);
8719             }
8720         }
8721     }
8722
8723   if (vec_initial_def)
8724     {
8725       vec_initial_defs.create (ncopies);
8726       for (i = 0; i < ncopies; ++i)
8727         vec_initial_defs.quick_push (vec_initial_def);
8728     }
8729
8730   if (auto *accumulator = reduc_info->reused_accumulator)
8731     {
8732       tree def = accumulator->reduc_input;
8733       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8734         {
8735           unsigned int nreduc;
8736           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8737                                             (TREE_TYPE (def)),
8738                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8739                                           &nreduc);
8740           gcc_assert (res);
8741           gimple_seq stmts = NULL;
8742           /* Reduce the single vector to a smaller one.  */
8743           if (nreduc != 1)
8744             {
8745               /* Perform the reduction in the appropriate type.  */
8746               tree rvectype = vectype_out;
8747               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8748                                               TREE_TYPE (TREE_TYPE (def))))
8749                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8750                                               TYPE_VECTOR_SUBPARTS
8751                                                 (vectype_out));
8752               def = vect_create_partial_epilog (def, rvectype,
8753                                                 STMT_VINFO_REDUC_CODE
8754                                                   (reduc_info),
8755                                                 &stmts);
8756             }
8757           /* The epilogue loop might use a different vector mode, like
8758              VNx2DI vs. V2DI.  */
8759           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8760             {
8761               tree reduc_type = build_vector_type_for_mode
8762                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8763               def = gimple_convert (&stmts, reduc_type, def);
8764             }
8765           /* Adjust the input so we pick up the partially reduced value
8766              for the skip edge in vect_create_epilog_for_reduction.  */
8767           accumulator->reduc_input = def;
8768           /* And the reduction could be carried out using a different sign.  */
8769           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8770             def = gimple_convert (&stmts, vectype_out, def);
8771           if (loop_vinfo->main_loop_edge)
8772             {
8773               /* While we'd like to insert on the edge this will split
8774                  blocks and disturb bookkeeping, we also will eventually
8775                  need this on the skip edge.  Rely on sinking to
8776                  fixup optimal placement and insert in the pred.  */
8777               gimple_stmt_iterator gsi
8778                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8779               /* Insert before a cond that eventually skips the
8780                  epilogue.  */
8781               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8782                 gsi_prev (&gsi);
8783               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8784             }
8785           else
8786             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8787                                               stmts);
8788         }
8789       if (loop_vinfo->main_loop_edge)
8790         vec_initial_defs[0]
8791           = vect_get_main_loop_result (loop_vinfo, def,
8792                                        vec_initial_defs[0]);
8793       else
8794         vec_initial_defs.safe_push (def);
8795     }
8796
8797   /* Generate the reduction PHIs upfront.  */
8798   for (i = 0; i < vec_num; i++)
8799     {
8800       tree vec_init_def = vec_initial_defs[i];
8801       for (j = 0; j < ncopies; j++)
8802         {
8803           /* Create the reduction-phi that defines the reduction
8804              operand.  */
8805           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8806
8807           /* Set the loop-entry arg of the reduction-phi.  */
8808           if (j != 0 && nested_cycle)
8809             vec_init_def = vec_initial_defs[j];
8810           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8811                        UNKNOWN_LOCATION);
8812
8813           /* The loop-latch arg is set in epilogue processing.  */
8814
8815           if (slp_node)
8816             slp_node->push_vec_def (new_phi);
8817           else
8818             {
8819               if (j == 0)
8820                 *vec_stmt = new_phi;
8821               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8822             }
8823         }
8824     }
8825
8826   return true;
8827 }
8828
8829 /* Vectorizes LC PHIs.  */
8830
8831 bool
8832 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8833                      stmt_vec_info stmt_info, gimple **vec_stmt,
8834                      slp_tree slp_node)
8835 {
8836   if (!loop_vinfo
8837       || !is_a <gphi *> (stmt_info->stmt)
8838       || gimple_phi_num_args (stmt_info->stmt) != 1)
8839     return false;
8840
8841   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8842       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8843     return false;
8844
8845   if (!vec_stmt) /* transformation not required.  */
8846     {
8847       /* Deal with copies from externs or constants that disguise as
8848          loop-closed PHI nodes (PR97886).  */
8849       if (slp_node
8850           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8851                                                 SLP_TREE_VECTYPE (slp_node)))
8852         {
8853           if (dump_enabled_p ())
8854             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8855                              "incompatible vector types for invariants\n");
8856           return false;
8857         }
8858       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8859       return true;
8860     }
8861
8862   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8863   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8864   basic_block bb = gimple_bb (stmt_info->stmt);
8865   edge e = single_pred_edge (bb);
8866   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8867   auto_vec<tree> vec_oprnds;
8868   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8869                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8870                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8871   for (unsigned i = 0; i < vec_oprnds.length (); i++)
8872     {
8873       /* Create the vectorized LC PHI node.  */
8874       gphi *new_phi = create_phi_node (vec_dest, bb);
8875       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8876       if (slp_node)
8877         slp_node->push_vec_def (new_phi);
8878       else
8879         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8880     }
8881   if (!slp_node)
8882     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8883
8884   return true;
8885 }
8886
8887 /* Vectorizes PHIs.  */
8888
8889 bool
8890 vectorizable_phi (vec_info *,
8891                   stmt_vec_info stmt_info, gimple **vec_stmt,
8892                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8893 {
8894   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8895     return false;
8896
8897   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8898     return false;
8899
8900   tree vectype = SLP_TREE_VECTYPE (slp_node);
8901
8902   if (!vec_stmt) /* transformation not required.  */
8903     {
8904       slp_tree child;
8905       unsigned i;
8906       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8907         if (!child)
8908           {
8909             if (dump_enabled_p ())
8910               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8911                                "PHI node with unvectorized backedge def\n");
8912             return false;
8913           }
8914         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8915           {
8916             if (dump_enabled_p ())
8917               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8918                                "incompatible vector types for invariants\n");
8919             return false;
8920           }
8921         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8922                  && !useless_type_conversion_p (vectype,
8923                                                 SLP_TREE_VECTYPE (child)))
8924           {
8925             /* With bools we can have mask and non-mask precision vectors
8926                or different non-mask precisions.  while pattern recog is
8927                supposed to guarantee consistency here bugs in it can cause
8928                mismatches (PR103489 and PR103800 for example).
8929                Deal with them here instead of ICEing later.  */
8930             if (dump_enabled_p ())
8931               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8932                                "incompatible vector type setup from "
8933                                "bool pattern detection\n");
8934             return false;
8935           }
8936
8937       /* For single-argument PHIs assume coalescing which means zero cost
8938          for the scalar and the vector PHIs.  This avoids artificially
8939          favoring the vector path (but may pessimize it in some cases).  */
8940       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8941         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8942                           vector_stmt, stmt_info, vectype, 0, vect_body);
8943       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8944       return true;
8945     }
8946
8947   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8948   basic_block bb = gimple_bb (stmt_info->stmt);
8949   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8950   auto_vec<gphi *> new_phis;
8951   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8952     {
8953       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8954
8955       /* Skip not yet vectorized defs.  */
8956       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8957           && SLP_TREE_VEC_DEFS (child).is_empty ())
8958         continue;
8959
8960       auto_vec<tree> vec_oprnds;
8961       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8962       if (!new_phis.exists ())
8963         {
8964           new_phis.create (vec_oprnds.length ());
8965           for (unsigned j = 0; j < vec_oprnds.length (); j++)
8966             {
8967               /* Create the vectorized LC PHI node.  */
8968               new_phis.quick_push (create_phi_node (vec_dest, bb));
8969               slp_node->push_vec_def (new_phis[j]);
8970             }
8971         }
8972       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8973       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8974         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8975     }
8976   /* We should have at least one already vectorized child.  */
8977   gcc_assert (new_phis.exists ());
8978
8979   return true;
8980 }
8981
8982 /* Vectorizes first order recurrences.  An overview of the transformation
8983    is described below. Suppose we have the following loop.
8984
8985      int t = 0;
8986      for (int i = 0; i < n; ++i)
8987        {
8988          b[i] = a[i] - t;
8989          t = a[i];
8990        }
8991
8992    There is a first-order recurrence on 'a'. For this loop, the scalar IR
8993    looks (simplified) like:
8994
8995     scalar.preheader:
8996       init = 0;
8997
8998     scalar.body:
8999       i = PHI <0(scalar.preheader), i+1(scalar.body)>
9000       _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9001       _1 = a[i]
9002       b[i] = _1 - _2
9003       if (i < n) goto scalar.body
9004
9005    In this example, _2 is a recurrence because it's value depends on the
9006    previous iteration.  We vectorize this as (VF = 4)
9007
9008     vector.preheader:
9009       vect_init = vect_cst(..., ..., ..., 0)
9010
9011     vector.body
9012       i = PHI <0(vector.preheader), i+4(vector.body)>
9013       vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9014       vect_2 = a[i, i+1, i+2, i+3];
9015       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9016       b[i, i+1, i+2, i+3] = vect_2 - vect_3
9017       if (..) goto vector.body
9018
9019    In this function, vectorizable_recurr, we code generate both the
9020    vector PHI node and the permute since those together compute the
9021    vectorized value of the scalar PHI.  We do not yet have the
9022    backedge value to fill in there nor into the vec_perm.  Those
9023    are filled in maybe_set_vectorized_backedge_value and
9024    vect_schedule_scc.
9025
9026    TODO:  Since the scalar loop does not have a use of the recurrence
9027    outside of the loop the natural way to implement peeling via
9028    vectorizing the live value doesn't work.  For now peeling of loops
9029    with a recurrence is not implemented.  For SLP the supported cases
9030    are restricted to those requiring a single vector recurrence PHI.  */
9031
9032 bool
9033 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9034                      gimple **vec_stmt, slp_tree slp_node,
9035                      stmt_vector_for_cost *cost_vec)
9036 {
9037   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9038     return false;
9039
9040   gphi *phi = as_a<gphi *> (stmt_info->stmt);
9041
9042   /* So far we only support first-order recurrence auto-vectorization.  */
9043   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9044     return false;
9045
9046   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9047   unsigned ncopies;
9048   if (slp_node)
9049     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9050   else
9051     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9052   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9053   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9054   /* We need to be able to make progress with a single vector.  */
9055   if (maybe_gt (dist * 2, nunits))
9056     {
9057       if (dump_enabled_p ())
9058         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9059                          "first order recurrence exceeds half of "
9060                          "a vector\n");
9061       return false;
9062     }
9063
9064   /* First-order recurrence autovectorization needs to handle permutation
9065      with indices = [nunits-1, nunits, nunits+1, ...].  */
9066   vec_perm_builder sel (nunits, 1, 3);
9067   for (int i = 0; i < 3; ++i)
9068     sel.quick_push (nunits - dist + i);
9069   vec_perm_indices indices (sel, 2, nunits);
9070
9071   if (!vec_stmt) /* transformation not required.  */
9072     {
9073       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9074                                  indices))
9075         return false;
9076
9077       if (slp_node)
9078         {
9079           /* We eventually need to set a vector type on invariant
9080              arguments.  */
9081           unsigned j;
9082           slp_tree child;
9083           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9084             if (!vect_maybe_update_slp_op_vectype
9085                   (child, SLP_TREE_VECTYPE (slp_node)))
9086               {
9087                 if (dump_enabled_p ())
9088                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9089                                    "incompatible vector types for "
9090                                    "invariants\n");
9091                 return false;
9092               }
9093         }
9094       /* The recurrence costs the initialization vector and one permute
9095          for each copy.  */
9096       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9097                                                  stmt_info, 0, vect_prologue);
9098       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9099                                                stmt_info, 0, vect_body);
9100       if (dump_enabled_p ())
9101         dump_printf_loc (MSG_NOTE, vect_location,
9102                          "vectorizable_recurr: inside_cost = %d, "
9103                          "prologue_cost = %d .\n", inside_cost,
9104                          prologue_cost);
9105
9106       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9107       return true;
9108     }
9109
9110   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9111   basic_block bb = gimple_bb (phi);
9112   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9113   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9114     {
9115       gimple_seq stmts = NULL;
9116       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9117       gsi_insert_seq_on_edge_immediate (pe, stmts);
9118     }
9119   tree vec_init = build_vector_from_val (vectype, preheader);
9120   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9121
9122   /* Create the vectorized first-order PHI node.  */
9123   tree vec_dest = vect_get_new_vect_var (vectype,
9124                                          vect_simple_var, "vec_recur_");
9125   gphi *new_phi = create_phi_node (vec_dest, bb);
9126   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9127
9128   /* Insert shuffles the first-order recurrence autovectorization.
9129        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
9130   tree perm = vect_gen_perm_mask_checked (vectype, indices);
9131
9132   /* Insert the required permute after the latch definition.  The
9133      second and later operands are tentative and will be updated when we have
9134      vectorized the latch definition.  */
9135   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9136   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9137   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9138   gsi_next (&gsi2);
9139
9140   for (unsigned i = 0; i < ncopies; ++i)
9141     {
9142       vec_dest = make_ssa_name (vectype);
9143       gassign *vperm
9144           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9145                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
9146                                  NULL, perm);
9147       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9148
9149       if (slp_node)
9150         slp_node->push_vec_def (vperm);
9151       else
9152         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9153     }
9154
9155   if (!slp_node)
9156     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9157   return true;
9158 }
9159
9160 /* Return true if VECTYPE represents a vector that requires lowering
9161    by the vector lowering pass.  */
9162
9163 bool
9164 vect_emulated_vector_p (tree vectype)
9165 {
9166   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9167           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9168               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9169 }
9170
9171 /* Return true if we can emulate CODE on an integer mode representation
9172    of a vector.  */
9173
9174 bool
9175 vect_can_vectorize_without_simd_p (tree_code code)
9176 {
9177   switch (code)
9178     {
9179     case PLUS_EXPR:
9180     case MINUS_EXPR:
9181     case NEGATE_EXPR:
9182     case BIT_AND_EXPR:
9183     case BIT_IOR_EXPR:
9184     case BIT_XOR_EXPR:
9185     case BIT_NOT_EXPR:
9186       return true;
9187
9188     default:
9189       return false;
9190     }
9191 }
9192
9193 /* Likewise, but taking a code_helper.  */
9194
9195 bool
9196 vect_can_vectorize_without_simd_p (code_helper code)
9197 {
9198   return (code.is_tree_code ()
9199           && vect_can_vectorize_without_simd_p (tree_code (code)));
9200 }
9201
9202 /* Create vector init for vectorized iv.  */
9203 static tree
9204 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9205                                tree step_expr, poly_uint64 nunits,
9206                                tree vectype,
9207                                enum vect_induction_op_type induction_type)
9208 {
9209   unsigned HOST_WIDE_INT const_nunits;
9210   tree vec_shift, vec_init, new_name;
9211   unsigned i;
9212   tree itype = TREE_TYPE (vectype);
9213
9214   /* iv_loop is the loop to be vectorized. Create:
9215      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
9216   new_name = gimple_convert (stmts, itype, init_expr);
9217   switch (induction_type)
9218     {
9219     case vect_step_op_shr:
9220     case vect_step_op_shl:
9221       /* Build the Initial value from shift_expr.  */
9222       vec_init = gimple_build_vector_from_val (stmts,
9223                                                vectype,
9224                                                new_name);
9225       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9226                                 build_zero_cst (itype), step_expr);
9227       vec_init = gimple_build (stmts,
9228                                (induction_type == vect_step_op_shr
9229                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9230                                vectype, vec_init, vec_shift);
9231       break;
9232
9233     case vect_step_op_neg:
9234       {
9235         vec_init = gimple_build_vector_from_val (stmts,
9236                                                  vectype,
9237                                                  new_name);
9238         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9239                                      vectype, vec_init);
9240         /* The encoding has 2 interleaved stepped patterns.  */
9241         vec_perm_builder sel (nunits, 2, 3);
9242         sel.quick_grow (6);
9243         for (i = 0; i < 3; i++)
9244           {
9245             sel[2 * i] = i;
9246             sel[2 * i + 1] = i + nunits;
9247           }
9248         vec_perm_indices indices (sel, 2, nunits);
9249         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9250            fail when vec_init is const vector. In that situation vec_perm is not
9251            really needed.  */
9252         tree perm_mask_even
9253           = vect_gen_perm_mask_any (vectype, indices);
9254         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9255                                  vectype,
9256                                  vec_init, vec_neg,
9257                                  perm_mask_even);
9258       }
9259       break;
9260
9261     case vect_step_op_mul:
9262       {
9263         /* Use unsigned mult to avoid UD integer overflow.  */
9264         gcc_assert (nunits.is_constant (&const_nunits));
9265         tree utype = unsigned_type_for (itype);
9266         tree uvectype = build_vector_type (utype,
9267                                            TYPE_VECTOR_SUBPARTS (vectype));
9268         new_name = gimple_convert (stmts, utype, new_name);
9269         vec_init = gimple_build_vector_from_val (stmts,
9270                                                  uvectype,
9271                                                  new_name);
9272         tree_vector_builder elts (uvectype, const_nunits, 1);
9273         tree elt_step = build_one_cst (utype);
9274
9275         elts.quick_push (elt_step);
9276         for (i = 1; i < const_nunits; i++)
9277           {
9278             /* Create: new_name_i = new_name + step_expr.  */
9279             elt_step = gimple_build (stmts, MULT_EXPR,
9280                                      utype, elt_step, step_expr);
9281             elts.quick_push (elt_step);
9282           }
9283         /* Create a vector from [new_name_0, new_name_1, ...,
9284            new_name_nunits-1].  */
9285         tree vec_mul = gimple_build_vector (stmts, &elts);
9286         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9287                                  vec_init, vec_mul);
9288         vec_init = gimple_convert (stmts, vectype, vec_init);
9289       }
9290       break;
9291
9292     default:
9293       gcc_unreachable ();
9294     }
9295
9296   return vec_init;
9297 }
9298
9299 /* Peel init_expr by skip_niter for induction_type.  */
9300 tree
9301 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9302                              tree skip_niters, tree step_expr,
9303                              enum vect_induction_op_type induction_type)
9304 {
9305   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9306   tree type = TREE_TYPE (init_expr);
9307   unsigned prec = TYPE_PRECISION (type);
9308   switch (induction_type)
9309     {
9310     case vect_step_op_neg:
9311       if (TREE_INT_CST_LOW (skip_niters) % 2)
9312         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9313       /* else no change.  */
9314       break;
9315
9316     case vect_step_op_shr:
9317     case vect_step_op_shl:
9318       skip_niters = gimple_convert (stmts, type, skip_niters);
9319       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9320       /* When shift mount >= precision, need to avoid UD.
9321          In the original loop, there's no UD, and according to semantic,
9322          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9323       if (!tree_fits_uhwi_p (step_expr)
9324           || tree_to_uhwi (step_expr) >= prec)
9325         {
9326           if (induction_type == vect_step_op_shl
9327               || TYPE_UNSIGNED (type))
9328             init_expr = build_zero_cst (type);
9329           else
9330             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9331                                       init_expr,
9332                                       wide_int_to_tree (type, prec - 1));
9333         }
9334       else
9335         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9336                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9337                                   type, init_expr, step_expr);
9338       break;
9339
9340     case vect_step_op_mul:
9341       {
9342         tree utype = unsigned_type_for (type);
9343         init_expr = gimple_convert (stmts, utype, init_expr);
9344         wide_int skipn = wi::to_wide (skip_niters);
9345         wide_int begin = wi::to_wide (step_expr);
9346         auto_mpz base, exp, mod, res;
9347         wi::to_mpz (begin, base, TYPE_SIGN (type));
9348         wi::to_mpz (skipn, exp, UNSIGNED);
9349         mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9350         mpz_powm (res, base, exp, mod);
9351         begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9352         tree mult_expr = wide_int_to_tree (utype, begin);
9353         init_expr = gimple_build (stmts, MULT_EXPR, utype,
9354                                   init_expr, mult_expr);
9355         init_expr = gimple_convert (stmts, type, init_expr);
9356       }
9357       break;
9358
9359     default:
9360       gcc_unreachable ();
9361     }
9362
9363   return init_expr;
9364 }
9365
9366 /* Create vector step for vectorized iv.  */
9367 static tree
9368 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9369                                poly_uint64 vf,
9370                                enum vect_induction_op_type induction_type)
9371 {
9372   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9373   tree new_name = NULL;
9374   /* Step should be pow (step, vf) for mult induction.  */
9375   if (induction_type == vect_step_op_mul)
9376     {
9377       gcc_assert (vf.is_constant ());
9378       wide_int begin = wi::to_wide (step_expr);
9379
9380       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9381         begin = wi::mul (begin, wi::to_wide (step_expr));
9382
9383       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9384     }
9385   else if (induction_type == vect_step_op_neg)
9386     /* Do nothing.  */
9387     ;
9388   else
9389     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9390                              expr, step_expr);
9391   return new_name;
9392 }
9393
9394 static tree
9395 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9396                                    stmt_vec_info stmt_info,
9397                                    tree new_name, tree vectype,
9398                                    enum vect_induction_op_type induction_type)
9399 {
9400   /* No step is needed for neg induction.  */
9401   if (induction_type == vect_step_op_neg)
9402     return NULL;
9403
9404   tree t = unshare_expr (new_name);
9405   gcc_assert (CONSTANT_CLASS_P (new_name)
9406               || TREE_CODE (new_name) == SSA_NAME);
9407   tree new_vec = build_vector_from_val (vectype, t);
9408   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9409                                     new_vec, vectype, NULL);
9410   return vec_step;
9411 }
9412
9413 /* Advance a nonlinear vector IV by one step.  Append to STMTS the
9414    statements that combine INDUC_DEF with VEC_STEP as INDUCTION_TYPE
9415    demands (multiply, left shift or right shift, built in VECTYPE) and
9416    return the new value.  Negation needs no statement: INDUC_DEF itself
9417    is returned.  */
9418 static tree
9419 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9420                           tree induc_def, tree vec_step,
9421                           enum vect_induction_op_type induction_type)
9422 {
9423   switch (induction_type)
9424     {
9425     case vect_step_op_neg:
9426       /* Nothing to build.  */
9427       return induc_def;
9428
9429     case vect_step_op_shl:
9430       return gimple_build (stmts, LSHIFT_EXPR, vectype,
9431                            induc_def, vec_step);
9432
9433     case vect_step_op_shr:
9434       return gimple_build (stmts, RSHIFT_EXPR, vectype,
9435                            induc_def, vec_step);
9436
9437     case vect_step_op_mul:
9438       {
9439         /* Multiply in the unsigned counterpart of VECTYPE so that integer
9440            overflow is not undefined, then convert the product back to
9441            VECTYPE.  */
9442         tree uelt_type = unsigned_type_for (TREE_TYPE (vectype));
9443         tree uvectype
9444           = build_vector_type (uelt_type, TYPE_VECTOR_SUBPARTS (vectype));
9445         tree lhs = gimple_convert (stmts, uvectype, induc_def);
9446         tree rhs = gimple_convert (stmts, uvectype, vec_step);
9447         tree product = gimple_build (stmts, MULT_EXPR, uvectype,
9448                                      lhs, rhs);
9449         return gimple_convert (stmts, vectype, product);
9450       }
9451
9452     default:
9453       gcc_unreachable ();
9454     }
9455 }
9456
9457 /* Function vectorizable_nonlinear_induction
9458
9459    Check if STMT_INFO performs a nonlinear induction computation (neg, mul,
9460    shl or shr) that can be vectorized.  If VEC_STMT is null, only do the
9461    checks and record costs in COST_VEC.  Otherwise also vectorize the
9462    induction PHI: create a vectorized phi in the loop header to replace it
9463    and store it in VEC_STMT.  Return true if STMT_INFO is vectorizable.  */
9464
9465 static bool
9466 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9467                                   stmt_vec_info stmt_info,
9468                                   gimple **vec_stmt, slp_tree slp_node,
9469                                   stmt_vector_for_cost *cost_vec)
9470 {
9471   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9472   unsigned ncopies;
9473   bool nested_in_vect_loop = false;
9474   class loop *iv_loop;
9475   tree vec_def;
9476   edge pe = loop_preheader_edge (loop);
9477   basic_block new_bb;
9478   tree vec_init, vec_step;
9479   tree new_name;
9480   gimple *new_stmt;
9481   gphi *induction_phi;
9482   tree induc_def, vec_dest;
9483   tree init_expr, step_expr;
9484   tree niters_skip;
9485   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9486   unsigned i;
9487   gimple_stmt_iterator si;
9488   /* vectorizable_induction checks for a PHI before calling this function.  */
9489   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9490
9491   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9492   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9493   enum vect_induction_op_type induction_type
9494     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9495   /* Linear (add) inductions are handled by vectorizable_induction itself.  */
9496   gcc_assert (induction_type > vect_step_op_add);
9497
9498   if (slp_node)
9499     ncopies = 1;
9500   else
9501     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9502   gcc_assert (ncopies >= 1);
9503
9504   /* FORNOW.  Nonlinear induction in a nested loop is not handled.  */
9505   if (nested_in_vect_loop_p (loop, stmt_info))
9506     {
9507       if (dump_enabled_p ())
9508         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9509                          "nonlinear induction in nested loop.\n");
9510       return false;
9511     }
9512
9513   iv_loop = loop;
9514   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9515
9516   /* TODO: Support SLP for nonlinear iv.  There should be a separate vector
9517      iv update for each iv and a permutation to generate the wanted iv.  */
9518   if (slp_node)
9519     {
9520       if (dump_enabled_p ())
9521         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9522                          "SLP induction not supported for nonlinear"
9523                          " induction.\n");
9524       return false;
9525     }
9526
9527   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9528     {
9529       if (dump_enabled_p ())
9530         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9531                          "floating point nonlinear induction vectorization"
9532                          " not supported.\n");
9533       return false;
9534     }
9535
9536   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9537   init_expr = vect_phi_initial_value (phi);
9538   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9539               && TREE_CODE (step_expr) == INTEGER_CST);
9540   /* Convert step_expr to the vector element type so it matches init_expr,
9541      e.g. for uint64 a >> 1 the step is int but the shift is on vector<uint64>.  */
9542   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9543
9544   if (TREE_CODE (init_expr) == INTEGER_CST)
9545     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9546   else
9547     gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9548                                        TREE_TYPE (init_expr)));
9549
9550   switch (induction_type)
9551     {
9552     case vect_step_op_neg:
9553       if (TREE_CODE (init_expr) != INTEGER_CST
9554           && TREE_CODE (init_expr) != REAL_CST)
9555         {
9556           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9557           if (!directly_supported_p (NEGATE_EXPR, vectype))
9558             return false;
9559
9560           /* The encoding has 2 interleaved stepped patterns.  */
9561           vec_perm_builder sel (nunits, 2, 3);
9562           machine_mode mode = TYPE_MODE (vectype);
9563           sel.quick_grow (6);
9564           for (i = 0; i < 3; i++)
9565             {
9566               sel[i * 2] = i;
9567               sel[i * 2 + 1] = i + nunits;
9568             }
9569           vec_perm_indices indices (sel, 2, nunits);
9570           if (!can_vec_perm_const_p (mode, mode, indices))
9571             return false;
9572         }
9573       break;
9574
9575     case vect_step_op_mul:
9576       {
9577         /* Check for backend support of MULT_EXPR.  */
9578         if (!directly_supported_p (MULT_EXPR, vectype))
9579           return false;
9580
9581         /* ??? How to construct the vector step for variable-length vectors:
9582            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9583         if (!vf.is_constant ())
9584           return false;
9585       }
9586       break;
9587
9588     case vect_step_op_shr:
9589       /* Check for backend support of RSHIFT_EXPR.  */
9590       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9591         return false;
9592
9593       /* Avoid undefined shifts: nunits * step must stay below the precision.  */
9594       if (!tree_fits_uhwi_p (step_expr)
9595           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9596                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9597         return false;
9598       break;
9599
9600     case vect_step_op_shl:
9601       /* Check for backend support of LSHIFT_EXPR.  */
9602       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9603         return false;
9604
9605       /* Avoid undefined shifts: nunits * step must stay below the precision.  */
9606       if (!tree_fits_uhwi_p (step_expr)
9607           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9608                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9609         return false;
9610
9611       break;
9612
9613     default:
9614       gcc_unreachable ();
9615     }
9616
9617   if (!vec_stmt) /* transformation not required.  */
9618     {
9619       unsigned inside_cost = 0, prologue_cost = 0;
9620       /* Loop cost for vec_loop: NCOPIES vector stmts are recorded in
9621          COST_VEC.  */
9622       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9623                                       stmt_info, 0, vect_body);
9624
9625       /* Neg induction doesn't have any inside_cost.  NOTE(review): this only
9626          zeroes the value dumped below; the cost recorded above stays.  */
9627       if (induction_type == vect_step_op_neg)
9628         inside_cost = 0;
9629
9630       /* prologue cost for vec_init and vec_step.  */
9631       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9632                                         stmt_info, 0, vect_prologue);
9633
9634       if (dump_enabled_p ())
9635         dump_printf_loc (MSG_NOTE, vect_location,
9636                          "vect_model_induction_cost: inside_cost = %d, "
9637                          "prologue_cost = %d. \n", inside_cost,
9638                          prologue_cost);
9639
9640       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9641       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9642       return true;
9643     }
9644
9645   /* Transform.  */
9646
9647   /* Compute a vector variable holding the initial values of the induction
9648      variable for one vector.  The lanes are built further down by
9649      vect_create_nonlinear_iv_init from init_expr and step_expr according
9650      to induction_type.  */
9651
9652   if (dump_enabled_p ())
9653     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9654
9655   pe = loop_preheader_edge (iv_loop);
9656   /* Find the first insertion point in the BB.  */
9657   basic_block bb = gimple_bb (phi);
9658   si = gsi_after_labels (bb);
9659
9660   gimple_seq stmts = NULL;
9661
9662   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9663   /* If we are using the loop mask to "peel" for alignment then we need
9664      to adjust the start value here.  */
9665   if (niters_skip != NULL_TREE)
9666     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9667                                              step_expr, induction_type);
9668
9669   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9670                                             step_expr, nunits, vectype,
9671                                             induction_type);
9672   if (stmts)
9673     {
9674       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9675       gcc_assert (!new_bb);
9676     }
9677
9678   stmts = NULL;
9679   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9680                                             vf, induction_type);
9681   if (stmts)
9682     {
9683       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9684       gcc_assert (!new_bb);
9685     }
9686
9687   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9688                                                 new_name, vectype,
9689                                                 induction_type);
9690   /* Create the following def-use cycle:
9691      loop prolog:
9692      vec_init = ...
9693      vec_step = ...
9694      loop:
9695      vec_iv = PHI <vec_init, vec_loop>
9696      ...
9697      STMT
9698      ...
9699      vec_loop = vec_iv op vec_step;  (op given by induction_type)  */
9700
9701   /* Create the induction-phi that defines the induction-operand.  */
9702   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9703   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9704   induc_def = PHI_RESULT (induction_phi);
9705
9706   /* Create the iv update inside the loop.  */
9707   stmts = NULL;
9708   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9709                                       induc_def, vec_step,
9710                                       induction_type);
9711
9712   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9713   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9714
9715   /* Set the arguments of the phi node:  */
9716   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9717   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9718                UNKNOWN_LOCATION);
9719
9720   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9721   *vec_stmt = induction_phi;
9722
9723   /* In case that vectorization factor (VF) is bigger than the number
9724      of elements that we can fit in a vectype (nunits), we have to generate
9725      more than one vector stmt - i.e - we need to "unroll" the
9726      vector stmt by a factor VF/nunits.  For more details see documentation
9727      in vectorizable_operation.  */
9728
9729   if (ncopies > 1)
9730     {
9731       stmts = NULL;
9732       /* FORNOW. This restriction should be relaxed.  */
9733       gcc_assert (!nested_in_vect_loop);
9734       /* The step between consecutive copies is derived from NUNITS, not VF.  */
9735       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9736                                                 nunits, induction_type);
9737
9738       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9739                                                     new_name, vectype,
9740                                                     induction_type);
9741       vec_def = induc_def;
9742       for (i = 1; i < ncopies; i++)
9743         {
9744           /* vec_i = vec_prev op vec_step.  */
9745           stmts = NULL;
9746           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9747                                               vec_def, vec_step,
9748                                               induction_type);
9749           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9750           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9751           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9752         }
9753     }
9754
9755   if (dump_enabled_p ())
9756     dump_printf_loc (MSG_NOTE, vect_location,
9757                      "transform induction: created def-use cycle: %G%G",
9758                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9759
9760   return true;
9761 }
9762
9763 /* Function vectorizable_induction
9764
9765    Check if STMT_INFO performs an induction computation that can be vectorized.
9766    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9767    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9768    Return true if STMT_INFO is vectorizable in this way.  */
9769
9770 bool
9771 vectorizable_induction (loop_vec_info loop_vinfo,
9772                         stmt_vec_info stmt_info,
9773                         gimple **vec_stmt, slp_tree slp_node,
9774                         stmt_vector_for_cost *cost_vec)
9775 {
9776   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9777   unsigned ncopies;
9778   bool nested_in_vect_loop = false;
9779   class loop *iv_loop;
9780   tree vec_def;
9781   edge pe = loop_preheader_edge (loop);
9782   basic_block new_bb;
9783   tree new_vec, vec_init, vec_step, t;
9784   tree new_name;
9785   gimple *new_stmt;
9786   gphi *induction_phi;
9787   tree induc_def, vec_dest;
9788   tree init_expr, step_expr;
9789   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9790   unsigned i;
9791   tree expr;
9792   gimple_stmt_iterator si;
9793   enum vect_induction_op_type induction_type
9794     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9795
9796   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9797   if (!phi)
9798     return false;
9799
9800   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9801     return false;
9802
9803   /* Make sure it was recognized as induction computation.  */
9804   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9805     return false;
9806
9807   /* Handle nonlinear induction in a separate place.  */
9808   if (induction_type != vect_step_op_add)
9809     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9810                                              vec_stmt, slp_node, cost_vec);
9811
9812   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9813   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9814
9815   if (slp_node)
9816     ncopies = 1;
9817   else
9818     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9819   gcc_assert (ncopies >= 1);
9820
9821   /* FORNOW. These restrictions should be relaxed.  */
9822   if (nested_in_vect_loop_p (loop, stmt_info))
9823     {
9824       imm_use_iterator imm_iter;
9825       use_operand_p use_p;
9826       gimple *exit_phi;
9827       edge latch_e;
9828       tree loop_arg;
9829
9830       if (ncopies > 1)
9831         {
9832           if (dump_enabled_p ())
9833             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9834                              "multiple types in nested loop.\n");
9835           return false;
9836         }
9837
9838       exit_phi = NULL;
9839       latch_e = loop_latch_edge (loop->inner);
9840       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9841       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9842         {
9843           gimple *use_stmt = USE_STMT (use_p);
9844           if (is_gimple_debug (use_stmt))
9845             continue;
9846
9847           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9848             {
9849               exit_phi = use_stmt;
9850               break;
9851             }
9852         }
9853       if (exit_phi)
9854         {
9855           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9856           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9857                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9858             {
9859               if (dump_enabled_p ())
9860                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9861                                  "inner-loop induction only used outside "
9862                                  "of the outer vectorized loop.\n");
9863               return false;
9864             }
9865         }
9866
9867       nested_in_vect_loop = true;
9868       iv_loop = loop->inner;
9869     }
9870   else
9871     iv_loop = loop;
9872   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9873
9874   if (slp_node && !nunits.is_constant ())
9875     {
9876       /* The current SLP code creates the step value element-by-element.  */
9877       if (dump_enabled_p ())
9878         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9879                          "SLP induction not supported for variable-length"
9880                          " vectors.\n");
9881       return false;
9882     }
9883
9884   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9885     {
9886       if (dump_enabled_p ())
9887         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9888                          "floating point induction vectorization disabled\n");
9889       return false;
9890     }
9891
9892   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9893   gcc_assert (step_expr != NULL_TREE);
9894   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9895
9896   /* Check for backend support of PLUS/MINUS_EXPR. */
9897   if (!directly_supported_p (PLUS_EXPR, step_vectype)
9898       || !directly_supported_p (MINUS_EXPR, step_vectype))
9899     return false;
9900
9901   if (!vec_stmt) /* transformation not required.  */
9902     {
9903       unsigned inside_cost = 0, prologue_cost = 0;
9904       if (slp_node)
9905         {
9906           /* We eventually need to set a vector type on invariant
9907              arguments.  */
9908           unsigned j;
9909           slp_tree child;
9910           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9911             if (!vect_maybe_update_slp_op_vectype
9912                 (child, SLP_TREE_VECTYPE (slp_node)))
9913               {
9914                 if (dump_enabled_p ())
9915                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9916                                    "incompatible vector types for "
9917                                    "invariants\n");
9918                 return false;
9919               }
9920           /* loop cost for vec_loop.  */
9921           inside_cost
9922             = record_stmt_cost (cost_vec,
9923                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9924                                 vector_stmt, stmt_info, 0, vect_body);
9925           /* prologue cost for vec_init (if not nested) and step.  */
9926           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9927                                             scalar_to_vec,
9928                                             stmt_info, 0, vect_prologue);
9929         }
9930       else /* if (!slp_node) */
9931         {
9932           /* loop cost for vec_loop.  */
9933           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9934                                           stmt_info, 0, vect_body);
9935           /* prologue cost for vec_init and vec_step.  */
9936           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9937                                             stmt_info, 0, vect_prologue);
9938         }
9939       if (dump_enabled_p ())
9940         dump_printf_loc (MSG_NOTE, vect_location,
9941                          "vect_model_induction_cost: inside_cost = %d, "
9942                          "prologue_cost = %d .\n", inside_cost,
9943                          prologue_cost);
9944
9945       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9946       DUMP_VECT_SCOPE ("vectorizable_induction");
9947       return true;
9948     }
9949
9950   /* Transform.  */
9951
9952   /* Compute a vector variable, initialized with the first VF values of
9953      the induction variable.  E.g., for an iv with IV_PHI='X' and
9954      evolution S, for a vector of 4 units, we want to compute:
9955      [X, X + S, X + 2*S, X + 3*S].  */
9956
9957   if (dump_enabled_p ())
9958     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9959
9960   pe = loop_preheader_edge (iv_loop);
9961   /* Find the first insertion point in the BB.  */
9962   basic_block bb = gimple_bb (phi);
9963   si = gsi_after_labels (bb);
9964
9965   /* For SLP induction we have to generate several IVs as for example
9966      with group size 3 we need
9967        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9968        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
9969   if (slp_node)
9970     {
9971       /* Enforced above.  */
9972       unsigned int const_nunits = nunits.to_constant ();
9973
9974       /* The initial values are vectorized, but any lanes > group_size
9975          need adjustment.  */
9976       slp_tree init_node
9977         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9978
9979       /* Gather steps.  Since we do not vectorize inductions as
9980          cycles we have to reconstruct the step from SCEV data.  */
9981       unsigned group_size = SLP_TREE_LANES (slp_node);
9982       tree *steps = XALLOCAVEC (tree, group_size);
9983       tree *inits = XALLOCAVEC (tree, group_size);
9984       stmt_vec_info phi_info;
9985       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9986         {
9987           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9988           if (!init_node)
9989             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9990                                            pe->dest_idx);
9991         }
9992
9993       /* Now generate the IVs.  */
9994       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9995       gcc_assert ((const_nunits * nvects) % group_size == 0);
9996       unsigned nivs;
9997       if (nested_in_vect_loop)
9998         nivs = nvects;
9999       else
10000         {
10001           /* Compute the number of distinct IVs we need.  First reduce
10002              group_size if it is a multiple of const_nunits so we get
10003              one IV for a group_size of 4 but const_nunits 2.  */
10004           unsigned group_sizep = group_size;
10005           if (group_sizep % const_nunits == 0)
10006             group_sizep = group_sizep / const_nunits;
10007           nivs = least_common_multiple (group_sizep,
10008                                         const_nunits) / const_nunits;
10009         }
10010       tree stept = TREE_TYPE (step_vectype);
10011       tree lupdate_mul = NULL_TREE;
10012       if (!nested_in_vect_loop)
10013         {
10014           /* The number of iterations covered in one vector iteration.  */
10015           unsigned lup_mul = (nvects * const_nunits) / group_size;
10016           lupdate_mul
10017             = build_vector_from_val (step_vectype,
10018                                      SCALAR_FLOAT_TYPE_P (stept)
10019                                      ? build_real_from_wide (stept, lup_mul,
10020                                                              UNSIGNED)
10021                                      : build_int_cstu (stept, lup_mul));
10022         }
10023       tree peel_mul = NULL_TREE;
10024       gimple_seq init_stmts = NULL;
10025       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10026         {
10027           if (SCALAR_FLOAT_TYPE_P (stept))
10028             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10029                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10030           else
10031             peel_mul = gimple_convert (&init_stmts, stept,
10032                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10033           peel_mul = gimple_build_vector_from_val (&init_stmts,
10034                                                    step_vectype, peel_mul);
10035         }
10036       unsigned ivn;
10037       auto_vec<tree> vec_steps;
10038       for (ivn = 0; ivn < nivs; ++ivn)
10039         {
10040           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10041           tree_vector_builder init_elts (vectype, const_nunits, 1);
10042           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10043           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10044             {
10045               /* The scalar steps of the IVs.  */
10046               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10047               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10048               step_elts.quick_push (elt);
10049               if (!init_node)
10050                 {
10051                   /* The scalar inits of the IVs if not vectorized.  */
10052                   elt = inits[(ivn*const_nunits + eltn) % group_size];
10053                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
10054                                                   TREE_TYPE (elt)))
10055                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10056                                         TREE_TYPE (vectype), elt);
10057                   init_elts.quick_push (elt);
10058                 }
10059               /* The number of steps to add to the initial values.  */
10060               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10061               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10062                                    ? build_real_from_wide (stept,
10063                                                            mul_elt, UNSIGNED)
10064                                    : build_int_cstu (stept, mul_elt));
10065             }
10066           vec_step = gimple_build_vector (&init_stmts, &step_elts);
10067           vec_steps.safe_push (vec_step);
10068           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10069           if (peel_mul)
10070             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10071                                      step_mul, peel_mul);
10072           if (!init_node)
10073             vec_init = gimple_build_vector (&init_stmts, &init_elts);
10074
10075           /* Create the induction-phi that defines the induction-operand.  */
10076           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10077                                             "vec_iv_");
10078           induction_phi = create_phi_node (vec_dest, iv_loop->header);
10079           induc_def = PHI_RESULT (induction_phi);
10080
10081           /* Create the iv update inside the loop  */
10082           tree up = vec_step;
10083           if (lupdate_mul)
10084             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10085                                vec_step, lupdate_mul);
10086           gimple_seq stmts = NULL;
10087           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10088           vec_def = gimple_build (&stmts,
10089                                   PLUS_EXPR, step_vectype, vec_def, up);
10090           vec_def = gimple_convert (&stmts, vectype, vec_def);
10091           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10092           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10093                        UNKNOWN_LOCATION);
10094
10095           if (init_node)
10096             vec_init = vect_get_slp_vect_def (init_node, ivn);
10097           if (!nested_in_vect_loop
10098               && !integer_zerop (step_mul))
10099             {
10100               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10101               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10102                                  vec_step, step_mul);
10103               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10104                                       vec_def, up);
10105               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10106             }
10107
10108           /* Set the arguments of the phi node:  */
10109           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10110
10111           slp_node->push_vec_def (induction_phi);
10112         }
10113       if (!nested_in_vect_loop)
10114         {
10115           /* Fill up to the number of vectors we need for the whole group.  */
10116           nivs = least_common_multiple (group_size,
10117                                         const_nunits) / const_nunits;
10118           vec_steps.reserve (nivs-ivn);
10119           for (; ivn < nivs; ++ivn)
10120             {
10121               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10122               vec_steps.quick_push (vec_steps[0]);
10123             }
10124         }
10125
10126       /* Re-use IVs when we can.  We are generating further vector
10127          stmts by adding VF' * stride to the IVs generated above.  */
10128       if (ivn < nvects)
10129         {
10130           unsigned vfp
10131             = least_common_multiple (group_size, const_nunits) / group_size;
10132           tree lupdate_mul
10133             = build_vector_from_val (step_vectype,
10134                                      SCALAR_FLOAT_TYPE_P (stept)
10135                                      ? build_real_from_wide (stept,
10136                                                              vfp, UNSIGNED)
10137                                      : build_int_cstu (stept, vfp));
10138           for (; ivn < nvects; ++ivn)
10139             {
10140               gimple *iv
10141                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10142               tree def = gimple_get_lhs (iv);
10143               if (ivn < 2*nivs)
10144                 vec_steps[ivn - nivs]
10145                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10146                                   vec_steps[ivn - nivs], lupdate_mul);
10147               gimple_seq stmts = NULL;
10148               def = gimple_convert (&stmts, step_vectype, def);
10149               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10150                                   def, vec_steps[ivn % nivs]);
10151               def = gimple_convert (&stmts, vectype, def);
10152               if (gimple_code (iv) == GIMPLE_PHI)
10153                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10154               else
10155                 {
10156                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10157                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10158                 }
10159               slp_node->push_vec_def (def);
10160             }
10161         }
10162
10163       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10164       gcc_assert (!new_bb);
10165
10166       return true;
10167     }
10168
10169   init_expr = vect_phi_initial_value (phi);
10170
10171   gimple_seq stmts = NULL;
10172   if (!nested_in_vect_loop)
10173     {
10174       /* Convert the initial value to the IV update type.  */
10175       tree new_type = TREE_TYPE (step_expr);
10176       init_expr = gimple_convert (&stmts, new_type, init_expr);
10177
10178       /* If we are using the loop mask to "peel" for alignment then we need
10179          to adjust the start value here.  */
10180       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10181       if (skip_niters != NULL_TREE)
10182         {
10183           if (FLOAT_TYPE_P (vectype))
10184             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10185                                         skip_niters);
10186           else
10187             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10188           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10189                                          skip_niters, step_expr);
10190           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10191                                     init_expr, skip_step);
10192         }
10193     }
10194
10195   if (stmts)
10196     {
10197       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10198       gcc_assert (!new_bb);
10199     }
10200
10201   /* Create the vector that holds the initial_value of the induction.  */
10202   if (nested_in_vect_loop)
10203     {
10204       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
10205          been created during vectorization of previous stmts.  We obtain it
10206          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
10207       auto_vec<tree> vec_inits;
10208       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10209                                      init_expr, &vec_inits);
10210       vec_init = vec_inits[0];
10211       /* If the initial value is not of proper type, convert it.  */
10212       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10213         {
10214           new_stmt
10215             = gimple_build_assign (vect_get_new_ssa_name (vectype,
10216                                                           vect_simple_var,
10217                                                           "vec_iv_"),
10218                                    VIEW_CONVERT_EXPR,
10219                                    build1 (VIEW_CONVERT_EXPR, vectype,
10220                                            vec_init));
10221           vec_init = gimple_assign_lhs (new_stmt);
10222           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10223                                                  new_stmt);
10224           gcc_assert (!new_bb);
10225         }
10226     }
10227   else
10228     {
10229       /* iv_loop is the loop to be vectorized. Create:
10230          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10231       stmts = NULL;
10232       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10233
10234       unsigned HOST_WIDE_INT const_nunits;
10235       if (nunits.is_constant (&const_nunits))
10236         {
10237           tree_vector_builder elts (step_vectype, const_nunits, 1);
10238           elts.quick_push (new_name);
10239           for (i = 1; i < const_nunits; i++)
10240             {
10241               /* Create: new_name_i = new_name + step_expr  */
10242               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10243                                        new_name, step_expr);
10244               elts.quick_push (new_name);
10245             }
10246           /* Create a vector from [new_name_0, new_name_1, ...,
10247              new_name_nunits-1]  */
10248           vec_init = gimple_build_vector (&stmts, &elts);
10249         }
10250       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10251         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10252         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10253                                  new_name, step_expr);
10254       else
10255         {
10256           /* Build:
10257                 [base, base, base, ...]
10258                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10259           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10260           gcc_assert (flag_associative_math);
10261           tree index = build_index_vector (step_vectype, 0, 1);
10262           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10263                                                         new_name);
10264           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10265                                                         step_expr);
10266           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10267           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10268                                    vec_init, step_vec);
10269           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10270                                    vec_init, base_vec);
10271         }
10272       vec_init = gimple_convert (&stmts, vectype, vec_init);
10273
10274       if (stmts)
10275         {
10276           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10277           gcc_assert (!new_bb);
10278         }
10279     }
10280
10281
10282   /* Create the vector that holds the step of the induction.  */
10283   if (nested_in_vect_loop)
10284     /* iv_loop is nested in the loop to be vectorized. Generate:
10285        vec_step = [S, S, S, S]  */
10286     new_name = step_expr;
10287   else
10288     {
10289       /* iv_loop is the loop to be vectorized. Generate:
10290           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10291       gimple_seq seq = NULL;
10292       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10293         {
10294           expr = build_int_cst (integer_type_node, vf);
10295           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10296         }
10297       else
10298         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10299       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10300                                expr, step_expr);
10301       if (seq)
10302         {
10303           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10304           gcc_assert (!new_bb);
10305         }
10306     }
10307
10308   t = unshare_expr (new_name);
10309   gcc_assert (CONSTANT_CLASS_P (new_name)
10310               || TREE_CODE (new_name) == SSA_NAME);
10311   new_vec = build_vector_from_val (step_vectype, t);
10312   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10313                                new_vec, step_vectype, NULL);
10314
10315
10316   /* Create the following def-use cycle:
10317      loop prolog:
10318          vec_init = ...
10319          vec_step = ...
10320      loop:
10321          vec_iv = PHI <vec_init, vec_loop>
10322          ...
10323          STMT
10324          ...
10325          vec_loop = vec_iv + vec_step;  */
10326
10327   /* Create the induction-phi that defines the induction-operand.  */
10328   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10329   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10330   induc_def = PHI_RESULT (induction_phi);
10331
10332   /* Create the iv update inside the loop  */
10333   stmts = NULL;
10334   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10335   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10336   vec_def = gimple_convert (&stmts, vectype, vec_def);
10337   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10338   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10339
10340   /* Set the arguments of the phi node:  */
10341   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10342   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10343                UNKNOWN_LOCATION);
10344
10345   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10346   *vec_stmt = induction_phi;
10347
10348   /* In case that vectorization factor (VF) is bigger than the number
10349      of elements that we can fit in a vectype (nunits), we have to generate
10350      more than one vector stmt - i.e - we need to "unroll" the
10351      vector stmt by a factor VF/nunits.  For more details see documentation
10352      in vectorizable_operation.  */
10353
10354   if (ncopies > 1)
10355     {
10356       gimple_seq seq = NULL;
10357       /* FORNOW. This restriction should be relaxed.  */
10358       gcc_assert (!nested_in_vect_loop);
10359
10360       /* Create the vector that holds the step of the induction.  */
10361       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10362         {
10363           expr = build_int_cst (integer_type_node, nunits);
10364           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10365         }
10366       else
10367         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10368       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10369                                expr, step_expr);
10370       if (seq)
10371         {
10372           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10373           gcc_assert (!new_bb);
10374         }
10375
10376       t = unshare_expr (new_name);
10377       gcc_assert (CONSTANT_CLASS_P (new_name)
10378                   || TREE_CODE (new_name) == SSA_NAME);
10379       new_vec = build_vector_from_val (step_vectype, t);
10380       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10381                                    new_vec, step_vectype, NULL);
10382
10383       vec_def = induc_def;
10384       for (i = 1; i < ncopies + 1; i++)
10385         {
10386           /* vec_i = vec_prev + vec_step  */
10387           gimple_seq stmts = NULL;
10388           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10389           vec_def = gimple_build (&stmts,
10390                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10391           vec_def = gimple_convert (&stmts, vectype, vec_def);
10392
10393           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10394           if (i < ncopies)
10395             {
10396               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10397               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10398             }
10399           else
10400             {
10401               /* vec_1 = vec_iv + (VF/n * S)
10402                  vec_2 = vec_1 + (VF/n * S)
10403                  ...
10404                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10405
10406                  vec_n is used as vec_loop to save the large step register and
10407                  related operations.  */
10408               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10409                            UNKNOWN_LOCATION);
10410             }
10411         }
10412     }
10413
10414   if (dump_enabled_p ())
10415     dump_printf_loc (MSG_NOTE, vect_location,
10416                      "transform induction: created def-use cycle: %G%G",
10417                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10418
10419   return true;
10420 }
10421
10422 /* Function vectorizable_live_operation.
10423
10424    STMT_INFO computes a value that is used outside the loop.  Check if
10425    it can be supported.
         
         VINFO is either a loop_vec_info (loop vectorization) or not, in which
         case the basic-block vectorization path at the end is used.
         SLP_NODE, when nonnull, is the SLP node the statement belongs to and
         SLP_INDEX is the index of the scalar statement within that node;
         SLP_NODE_INSTANCE is only forwarded to
         vect_create_epilog_for_reduction.
         When VEC_STMT_P is false only the analysis is performed (possibly
         disabling partial vectors for the loop and, when VINFO is not a loop,
         recording a vec_to_scalar cost in COST_VEC).  When VEC_STMT_P is true
         the lane extraction is generated and the uses of the scalar result
         are redirected to it.
         Return false only if the vector and lane that hold the final SLP
         result cannot be determined; return true otherwise.  */
10426
10427 bool
10428 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10429                              slp_tree slp_node, slp_instance slp_node_instance,
10430                              int slp_index, bool vec_stmt_p,
10431                              stmt_vector_for_cost *cost_vec)
10432 {
10433   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10434   imm_use_iterator imm_iter;
10435   tree lhs, lhs_type, bitsize;
10436   tree vectype = (slp_node
10437                   ? SLP_TREE_VECTYPE (slp_node)
10438                   : STMT_VINFO_VECTYPE (stmt_info));
10439   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10440   int ncopies;
10441   gimple *use_stmt;
10442   auto_vec<tree> vec_oprnds;
10443   int vec_entry = 0;
10444   poly_uint64 vec_index = 0;
10445
10446   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10447
10448   /* If a stmt of a reduction is live, vectorize it via
10449      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10450      validity so just trigger the transform here.  */
10451   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10452     {
10453       if (!vec_stmt_p)
10454         return true;
10455       if (slp_node)
10456         {
10457           /* For reduction chains the meta-info is attached to
10458              the group leader.  */
10459           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10460             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10461           /* For SLP reductions we vectorize the epilogue for
10462              all involved stmts together.  */
10463           else if (slp_index != 0)
10464             return true;
10465         }
10466       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10467       gcc_assert (reduc_info->is_reduc_info);
            /* No epilogue is created here for fold-left and extract-last
               reductions.  */
10468       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10469           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10470         return true;
10471       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10472                                         slp_node_instance);
10473       return true;
10474     }
10475
10476   /* If STMT is not relevant and it is a simple assignment and its inputs are
10477      invariant then it can remain in place, unvectorized.  The original last
10478      scalar value that it computes will be used.  */
10479   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10480     {
10481       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10482       if (dump_enabled_p ())
10483         dump_printf_loc (MSG_NOTE, vect_location,
10484                          "statement is simple and uses invariant.  Leaving in "
10485                          "place.\n");
10486       return true;
10487     }
10488
10489   if (slp_node)
10490     ncopies = 1;
10491   else
10492     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10493
10494   if (slp_node)
10495     {
10496       gcc_assert (slp_index >= 0);
10497
10498       /* Get the last occurrence of the scalar index from the concatenation of
10499          all the slp vectors. Calculate which slp vector it is and the index
10500          within.  */
10501       int num_scalar = SLP_TREE_LANES (slp_node);
10502       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10503       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10504
10505       /* Calculate which vector contains the result, and which lane of
10506          that vector we need.  */
10507       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10508         {
10509           if (dump_enabled_p ())
10510             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10511                              "Cannot determine which vector holds the"
10512                              " final result.\n");
10513           return false;
10514         }
10515     }
10516
        /* Analysis only: decide whether partial vectors remain usable for this
           live statement and, when VINFO is not a loop, record the cost of the
           extraction.  */
10517   if (!vec_stmt_p)
10518     {
10519       /* No transformation required.  */
10520       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10521         {
10522           if (slp_node)
10523             {
10524               if (dump_enabled_p ())
10525                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10526                                  "can't operate on partial vectors "
10527                                  "because an SLP statement is live after "
10528                                  "the loop.\n");
10529               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10530             }
10531           else if (ncopies > 1)
10532             {
10533               if (dump_enabled_p ())
10534                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10535                                  "can't operate on partial vectors "
10536                                  "because ncopies is greater than 1.\n");
10537               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10538             }
10539           else
10540             {
10541               gcc_assert (ncopies == 1 && !slp_node);
                    /* Prefer a loop mask when IFN_EXTRACT_LAST is supported,
                       otherwise a loop length when a variable-index element
                       extract is available; with neither, partial vectors are
                       given up.  */
10542               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10543                                                   OPTIMIZE_FOR_SPEED))
10544                 vect_record_loop_mask (loop_vinfo,
10545                                        &LOOP_VINFO_MASKS (loop_vinfo),
10546                                        1, vectype, NULL);
10547               else if (can_vec_extract_var_idx_p (
10548                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10549                 vect_record_loop_len (loop_vinfo,
10550                                       &LOOP_VINFO_LENS (loop_vinfo),
10551                                       1, vectype, 1);
10552               else
10553                 {
10554                   if (dump_enabled_p ())
10555                     dump_printf_loc (
10556                       MSG_MISSED_OPTIMIZATION, vect_location,
10557                       "can't operate on partial vectors "
10558                       "because the target doesn't support extract "
10559                       "last reduction.\n");
10560                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10561                 }
10562             }
10563         }
10564       /* ???  Enable for loop costing as well.  */
10565       if (!loop_vinfo)
10566         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10567                           0, vect_epilogue);
10568       return true;
10569     }
10570
10571   /* Use the lhs of the original scalar statement.  */
10572   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10573   if (dump_enabled_p ())
10574     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10575                      "stmt %G", stmt);
10576
10577   lhs = gimple_get_lhs (stmt);
10578   lhs_type = TREE_TYPE (lhs);
10579
10580   bitsize = vector_element_bits_tree (vectype);
10581
10582   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10583   tree vec_lhs, bitstart;
10584   gimple *vec_stmt;
10585   if (slp_node)
10586     {
10587       gcc_assert (!loop_vinfo
10588                   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10589                       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10590
10591       /* Get the correct slp vectorized stmt.  */
10592       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10593       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10594
10595       /* Get entry to use.  */
10596       bitstart = bitsize_int (vec_index);
10597       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10598     }
10599   else
10600     {
10601       /* For multiple copies, get the last copy.  */
10602       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10603       vec_lhs = gimple_get_lhs (vec_stmt);
10604
10605       /* Get the last lane in the vector.  */
10606       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10607     }
10608
10609   if (loop_vinfo)
10610     {
10611       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10612          requirement, insert one phi node for it.  It looks like:
10613            loop;
10614          BB:
10615            # lhs' = PHI <lhs>
10616          ==>
10617            loop;
10618          BB:
10619            # vec_lhs' = PHI <vec_lhs>
10620            new_tree = lane_extract <vec_lhs', ...>;
10621            lhs' = new_tree;  */
10622
10623       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10624       basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10625       gcc_assert (single_pred_p (exit_bb));
10626
10627       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10628       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10629       SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10630
10631       gimple_seq stmts = NULL;
10632       tree new_tree;
10633       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10634         {
10635           /* Emit:
10636
10637                SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10638
10639              where VEC_LHS is the vectorized live-out result and LEN is
10640              the loop length returned by vect_get_loop_len below.  */
10641           gcc_assert (ncopies == 1 && !slp_node);
                /* NOTE(review): unlike the mask path below, TEM is never
                   appended to STMTS here; presumably vect_get_loop_len emits
                   no statements for these arguments -- confirm.  */
10642           gimple_seq tem = NULL;
10643           gimple_stmt_iterator gsi = gsi_last (tem);
10644           tree len
10645             = vect_get_loop_len (loop_vinfo, &gsi,
10646                                  &LOOP_VINFO_LENS (loop_vinfo),
10647                                  1, vectype, 0, 0);
10648
10649           /* BIAS - 1.  */
10650           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10651           tree bias_minus_one
10652             = int_const_binop (MINUS_EXPR,
10653                                build_int_cst (TREE_TYPE (len), biasval),
10654                                build_one_cst (TREE_TYPE (len)));
10655
10656           /* LAST_INDEX = LEN + (BIAS - 1).  */
10657           tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10658                                           len, bias_minus_one);
10659
10660           /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
10661           tree scalar_res
10662             = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10663                             vec_lhs_phi, last_index);
10664
10665           /* Convert the extracted vector element to the scalar type.  */
10666           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10667         }
10668       else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10669         {
10670           /* Emit:
10671
10672                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10673
10674              where VEC_LHS is the vectorized live-out result and MASK is
10675              the loop mask for the final iteration.  */
10676           gcc_assert (ncopies == 1 && !slp_node);
10677           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10678           gimple_seq tem = NULL;
10679           gimple_stmt_iterator gsi = gsi_last (tem);
10680           tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10681                                           &LOOP_VINFO_MASKS (loop_vinfo),
10682                                           1, vectype, 0);
10683           gimple_seq_add_seq (&stmts, tem);
10684           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10685                                           mask, vec_lhs_phi);
10686
10687           /* Convert the extracted vector element to the scalar type.  */
10688           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10689         }
10690       else
10691         {
                /* Neither lengths nor masks are in use: read the lane at
                   BITSTART directly with a BIT_FIELD_REF.  */
10692           tree bftype = TREE_TYPE (vectype);
10693           if (VECTOR_BOOLEAN_TYPE_P (vectype))
10694             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10695           new_tree = build3 (BIT_FIELD_REF, bftype,
10696                              vec_lhs_phi, bitsize, bitstart);
10697           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10698                                            &stmts, true, NULL_TREE);
10699         }
10700
10701       if (stmts)
10702         {
10703           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10704           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10705
10706           /* Remove existing phi from lhs and create one copy from new_tree.  */
10707           tree lhs_phi = NULL_TREE;
10708           gimple_stmt_iterator gsi;
10709           for (gsi = gsi_start_phis (exit_bb);
10710                !gsi_end_p (gsi); gsi_next (&gsi))
10711             {
10712               gimple *phi = gsi_stmt (gsi);
                    /* EXIT_BB has a single predecessor (asserted above), so
                       only argument 0 needs to be inspected.  */
10713               if ((gimple_phi_arg_def (phi, 0) == lhs))
10714                 {
                        /* NOTE(review): the false argument presumably keeps the
                           PHI result SSA name alive, since it is reused just
                           below as the lhs of the copy -- confirm.  */
10715                   remove_phi_node (&gsi, false);
10716                   lhs_phi = gimple_phi_result (phi);
10717                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10718                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10719                   break;
10720                 }
10721             }
10722         }
10723
10724       /* Replace use of lhs with newly computed result.  If the use stmt is a
10725          single arg PHI, just replace all uses of PHI result.  It's necessary
10726          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10727       use_operand_p use_p;
10728       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10729         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10730             && !is_gimple_debug (use_stmt))
10731           {
10732             if (gimple_code (use_stmt) == GIMPLE_PHI
10733                 && gimple_phi_num_args (use_stmt) == 1)
10734               {
10735                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10736               }
10737             else
10738               {
10739                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10740                     SET_USE (use_p, new_tree);
10741               }
10742             update_stmt (use_stmt);
10743           }
10744     }
10745   else
10746     {
10747       /* For basic-block vectorization simply insert the lane-extraction.  */
10748       tree bftype = TREE_TYPE (vectype);
10749       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10750         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10751       tree new_tree = build3 (BIT_FIELD_REF, bftype,
10752                               vec_lhs, bitsize, bitstart);
10753       gimple_seq stmts = NULL;
10754       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10755                                        &stmts, true, NULL_TREE);
            /* Carry the abnormal-PHI flag of the scalar lhs over to the
               replacement SSA name.  */
10756       if (TREE_CODE (new_tree) == SSA_NAME
10757           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10758         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
            /* Place the extraction after the labels of the block when the
               vector def is a PHI, otherwise right after the defining
               statement.  */
10759       if (is_a <gphi *> (vec_stmt))
10760         {
10761           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10762           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10763         }
10764       else
10765         {
10766           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10767           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10768         }
10769
10770       /* Replace use of lhs with newly computed result.  If the use stmt is a
10771          single arg PHI, just replace all uses of PHI result.  It's necessary
10772          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10773       use_operand_p use_p;
10774       stmt_vec_info use_stmt_info;
10775       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10776         if (!is_gimple_debug (use_stmt)
10777             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10778                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10779           {
10780             /* ???  This can happen when the live lane ends up being
10781                used in a vector construction code-generated by an
10782                external SLP node (and code-generation for that already
10783                happened).  See gcc.dg/vect/bb-slp-47.c.
10784                Doing this is what would happen if that vector CTOR
10785                were not code-generated yet so it is not too bad.
10786                ???  In fact we'd likely want to avoid this situation
10787                in the first place.  */
10788             if (TREE_CODE (new_tree) == SSA_NAME
10789                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10790                 && gimple_code (use_stmt) != GIMPLE_PHI
10791                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10792                                                 use_stmt))
10793               {
10794                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10795                 gcc_checking_assert (code == SSA_NAME
10796                                      || code == CONSTRUCTOR
10797                                      || code == VIEW_CONVERT_EXPR
10798                                      || CONVERT_EXPR_CODE_P (code));
10799                 if (dump_enabled_p ())
10800                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10801                                    "Using original scalar computation for "
10802                                    "live lane because use preceeds vector "
10803                                    "def\n");
10804                 continue;
10805               }
10806             /* ???  It can also happen that we end up pulling a def into
10807                a loop where replacing out-of-loop uses would require
10808                a new LC SSA PHI node.  Retain the original scalar in
10809                those cases as well.  PR98064.  */
10810             if (TREE_CODE (new_tree) == SSA_NAME
10811                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10812                 && (gimple_bb (use_stmt)->loop_father
10813                     != gimple_bb (vec_stmt)->loop_father)
10814                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10815                                         gimple_bb (use_stmt)->loop_father))
10816               {
10817                 if (dump_enabled_p ())
10818                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10819                                    "Using original scalar computation for "
10820                                    "live lane because there is an out-of-loop "
10821                                    "definition for it\n");
10822                 continue;
10823               }
10824             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10825               SET_USE (use_p, new_tree);
10826             update_stmt (use_stmt);
10827           }
10828     }
10829
10830   return true;
10831 }
10832
10833 /* Reset debug binds outside LOOP that use SSA names STMT_INFO defines.  */
10834
10835 static void
10836 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10837 {
10838   ssa_op_iter def_iter;
10839   imm_use_iterator use_iter;
10840   def_operand_p def;
10841   gimple *use_stmt;
10842
10843   /* Walk every use statement of every definition of the statement.  */
10844   FOR_EACH_PHI_OR_STMT_DEF (def, stmt_info->stmt, def_iter, SSA_OP_DEF)
10845     {
10846       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def))
10847         {
10848           /* Statements that are not debug statements are left alone.  */
10849           if (!is_gimple_debug (use_stmt))
10850             continue;
10851
10852           /* So are debug statements located inside LOOP.  */
10853           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10854             continue;
10855
10856           /* Anything remaining must be a debug bind; other kinds of
10857              debug statements are not expected here.  */
10858           if (!gimple_debug_bind_p (use_stmt))
10859             gcc_unreachable ();
10860
10861           if (dump_enabled_p ())
10862             dump_printf_loc (MSG_NOTE, vect_location,
10863                              "killing debug use\n");
10864
10865           /* Reset the bound value and update the statement.  */
10866           gimple_debug_bind_reset_value (use_stmt);
10867           update_stmt (use_stmt);
10868         }
10869     }
10870 }
10871
10872 /* Return true if, for the loop represented by LOOP_VINFO, computing
10873    LOOP_VINFO_NITERS as LOOP_VINFO_NITERSM1 + 1 is known not to overflow,
10874    and false otherwise.  */
10875
10876 static bool
10877 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10878 {
10879   /* With a known iteration count both values are constants to compare.  */
10880   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10881     {
10882       tree niters = LOOP_VINFO_NITERS (loop_vinfo);
10883       tree nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10884
10885       gcc_assert (TREE_CODE (niters) == INTEGER_CST);
10886       gcc_assert (TREE_CODE (nitersm1) == INTEGER_CST);
10887       if (wi::to_widest (niters) > wi::to_widest (nitersm1))
10888         return true;
10889     }
10890
10891   /* Otherwise check the upper bound of loop niters, as reported by
10892      get_max_loop_iterations: there is no overflow if that bound lies
10893      strictly below the maximum value of the type of LOOP_VINFO_NITERS.  */
10894   widest_int max_iters;
10895   if (!get_max_loop_iterations (LOOP_VINFO_LOOP (loop_vinfo), &max_iters))
10896     return false;
10897
10898   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10899   widest_int limit = widest_int::from (wi::max_value (niters_type),
10900                                        TYPE_SIGN (niters_type));
10901
10902   return max_iters < limit;
10903 }
10904
10905 /* Return a mask type with half the number of elements as OLD_TYPE,
10906    given that it should have mode NEW_MODE.  */
10907
10908 tree
10909 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10910 {
10911   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10912   return build_truth_vector_type_for_mode (nunits, new_mode);
10913 }
10914
10915 /* Return a mask type with twice as many elements as OLD_TYPE,
10916    given that it should have mode NEW_MODE.  */
10917
10918 tree
10919 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10920 {
10921   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10922   return build_truth_vector_type_for_mode (nunits, new_mode);
10923 }
10924
10925 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10926    contain a sequence of NVECTORS masks that each control a vector of type
10927    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
10928    these vector masks with the vector version of SCALAR_MASK.  */
10929
10930 void
10931 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10932                        unsigned int nvectors, tree vectype, tree scalar_mask)
10933 {
10934   gcc_assert (nvectors != 0);
10935
10936   if (scalar_mask)
10937     {
10938       scalar_cond_masked_key cond (scalar_mask, nvectors);
10939       loop_vinfo->scalar_cond_masked_set.add (cond);
10940     }
10941
10942   masks->mask_set.add (std::make_pair (vectype, nvectors));
10943 }
10944
10945 /* Given a complete set of masks MASKS, extract mask number INDEX
10946    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10947    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
10948
10949    See the comment above vec_loop_masks for more details about the mask
10950    arrangement.  */
10951
10952 tree
10953 vect_get_loop_mask (loop_vec_info loop_vinfo,
10954                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10955                     unsigned int nvectors, tree vectype, unsigned int index)
10956 {
10957   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10958       == vect_partial_vectors_while_ult)
10959     {
10960       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10961       tree mask_type = rgm->type;
10962
10963       /* Populate the rgroup's mask array, if this is the first time we've
10964          used it.  */
10965       if (rgm->controls.is_empty ())
10966         {
10967           rgm->controls.safe_grow_cleared (nvectors, true);
10968           for (unsigned int i = 0; i < nvectors; ++i)
10969             {
10970               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10971               /* Provide a dummy definition until the real one is available.  */
10972               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10973               rgm->controls[i] = mask;
10974             }
10975         }
10976
10977       tree mask = rgm->controls[index];
10978       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10979                     TYPE_VECTOR_SUBPARTS (vectype)))
10980         {
10981           /* A loop mask for data type X can be reused for data type Y
10982              if X has N times more elements than Y and if Y's elements
10983              are N times bigger than X's.  In this case each sequence
10984              of N elements in the loop mask will be all-zero or all-one.
10985              We can then view-convert the mask so that each sequence of
10986              N elements is replaced by a single element.  */
10987           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10988                                   TYPE_VECTOR_SUBPARTS (vectype)));
10989           gimple_seq seq = NULL;
10990           mask_type = truth_type_for (vectype);
10991           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10992           if (seq)
10993             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10994         }
10995       return mask;
10996     }
10997   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10998            == vect_partial_vectors_avx512)
10999     {
11000       /* The number of scalars per iteration and the number of vectors are
11001          both compile-time constants.  */
11002       unsigned int nscalars_per_iter
11003         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11004                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11005
11006       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11007
11008       /* The stored nV is dependent on the mask type produced.  */
11009       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11010                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11011                   == rgm->factor);
11012       nvectors = rgm->factor;
11013
11014       /* Populate the rgroup's mask array, if this is the first time we've
11015          used it.  */
11016       if (rgm->controls.is_empty ())
11017         {
11018           rgm->controls.safe_grow_cleared (nvectors, true);
11019           for (unsigned int i = 0; i < nvectors; ++i)
11020             {
11021               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11022               /* Provide a dummy definition until the real one is available.  */
11023               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11024               rgm->controls[i] = mask;
11025             }
11026         }
11027       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11028                     TYPE_VECTOR_SUBPARTS (vectype)))
11029         return rgm->controls[index];
11030
11031       /* Split the vector if needed.  Since we are dealing with integer mode
11032          masks with AVX512 we can operate on the integer representation
11033          performing the whole vector shifting.  */
11034       unsigned HOST_WIDE_INT factor;
11035       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11036                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
11037       gcc_assert (ok);
11038       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11039       tree mask_type = truth_type_for (vectype);
11040       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11041       unsigned vi = index / factor;
11042       unsigned vpart = index % factor;
11043       tree vec = rgm->controls[vi];
11044       gimple_seq seq = NULL;
11045       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11046                           lang_hooks.types.type_for_mode
11047                                 (TYPE_MODE (rgm->type), 1), vec);
11048       /* For integer mode masks simply shift the right bits into position.  */
11049       if (vpart != 0)
11050         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11051                             build_int_cst (integer_type_node,
11052                                            (TYPE_VECTOR_SUBPARTS (vectype)
11053                                             * vpart)));
11054       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11055                                     (TYPE_MODE (mask_type), 1), vec);
11056       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11057       if (seq)
11058         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11059       return vec;
11060     }
11061   else
11062     gcc_unreachable ();
11063 }
11064
11065 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11066    lengths for controlling an operation on VECTYPE.  The operation splits
11067    each element of VECTYPE into FACTOR separate subelements, measuring the
11068    length as a number of these subelements.  */
11069
11070 void
11071 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11072                       unsigned int nvectors, tree vectype, unsigned int factor)
11073 {
11074   gcc_assert (nvectors != 0);
11075   if (lens->length () < nvectors)
11076     lens->safe_grow_cleared (nvectors, true);
11077   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11078
11079   /* The number of scalars per iteration, scalar occupied bytes and
11080      the number of vectors are both compile-time constants.  */
11081   unsigned int nscalars_per_iter
11082     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11083                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11084
11085   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11086     {
11087       /* For now, we only support cases in which all loads and stores fall back
11088          to VnQI or none do.  */
11089       gcc_assert (!rgl->max_nscalars_per_iter
11090                   || (rgl->factor == 1 && factor == 1)
11091                   || (rgl->max_nscalars_per_iter * rgl->factor
11092                       == nscalars_per_iter * factor));
11093       rgl->max_nscalars_per_iter = nscalars_per_iter;
11094       rgl->type = vectype;
11095       rgl->factor = factor;
11096     }
11097 }
11098
11099 /* Given a complete set of lengths LENS, extract length number INDEX
11100    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11101    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
11102    multipled by the number of elements that should be processed.
11103    Insert any set-up statements before GSI.  */
11104
11105 tree
11106 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11107                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11108                    unsigned int index, unsigned int factor)
11109 {
11110   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11111   bool use_bias_adjusted_len =
11112     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11113
11114   /* Populate the rgroup's len array, if this is the first time we've
11115      used it.  */
11116   if (rgl->controls.is_empty ())
11117     {
11118       rgl->controls.safe_grow_cleared (nvectors, true);
11119       for (unsigned int i = 0; i < nvectors; ++i)
11120         {
11121           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11122           gcc_assert (len_type != NULL_TREE);
11123
11124           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11125
11126           /* Provide a dummy definition until the real one is available.  */
11127           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11128           rgl->controls[i] = len;
11129
11130           if (use_bias_adjusted_len)
11131             {
11132               gcc_assert (i == 0);
11133               tree adjusted_len =
11134                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11135               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11136               rgl->bias_adjusted_ctrl = adjusted_len;
11137             }
11138         }
11139     }
11140
11141   if (use_bias_adjusted_len)
11142     return rgl->bias_adjusted_ctrl;
11143
11144   tree loop_len = rgl->controls[index];
11145   if (rgl->factor == 1 && factor == 1)
11146     {
11147       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11148       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11149       if (maybe_ne (nunits1, nunits2))
11150         {
11151           /* A loop len for data type X can be reused for data type Y
11152              if X has N times more elements than Y and if Y's elements
11153              are N times bigger than X's.  */
11154           gcc_assert (multiple_p (nunits1, nunits2));
11155           factor = exact_div (nunits1, nunits2).to_constant ();
11156           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11157           gimple_seq seq = NULL;
11158           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11159                                    build_int_cst (iv_type, factor));
11160           if (seq)
11161             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11162         }
11163     }
11164   return loop_len;
11165 }
11166
11167 /* Scale profiling counters by estimation for LOOP which is vectorized
11168    by factor VF.
11169    If FLAT is true, the loop we started with had unrealistically flat
11170    profile.  */
11171
11172 static void
11173 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11174 {
11175   /* For flat profiles do not scale down proportionally by VF and only
11176      cap by known iteration count bounds.  */
11177   if (flat)
11178     {
11179       if (dump_file && (dump_flags & TDF_DETAILS))
11180         fprintf (dump_file,
11181                  "Vectorized loop profile seems flat; not scaling iteration "
11182                  "count down by the vectorization factor %i\n", vf);
11183       scale_loop_profile (loop, profile_probability::always (),
11184                           get_likely_max_loop_iterations_int (loop));
11185       return;
11186     }
11187   /* Loop body executes VF fewer times and exit increases VF times.  */
11188   profile_count entry_count = loop_preheader_edge (loop)->count ();
11189
11190   /* If we have unreliable loop profile avoid dropping entry
11191      count bellow header count.  This can happen since loops
11192      has unrealistically low trip counts.  */
11193   while (vf > 1
11194          && loop->header->count > entry_count
11195          && loop->header->count < entry_count * vf)
11196     {
11197       if (dump_file && (dump_flags & TDF_DETAILS))
11198         fprintf (dump_file,
11199                  "Vectorization factor %i seems too large for profile "
11200                  "prevoiusly believed to be consistent; reducing.\n", vf);
11201       vf /= 2;
11202     }
11203
11204   if (entry_count.nonzero_p ())
11205     set_edge_probability_and_rescale_others
11206             (exit_e,
11207              entry_count.probability_in (loop->header->count / vf));
11208   /* Avoid producing very large exit probability when we do not have
11209      sensible profile.  */
11210   else if (exit_e->probability < profile_probability::always () / (vf * 2))
11211     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11212   loop->latch->count = single_pred_edge (loop->latch)->count ();
11213
11214   scale_loop_profile (loop, profile_probability::always () / vf,
11215                       get_likely_max_loop_iterations_int (loop));
11216 }
11217
11218 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11219    latch edge values originally defined by it.  */
11220
11221 static void
11222 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11223                                      stmt_vec_info def_stmt_info)
11224 {
11225   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11226   if (!def || TREE_CODE (def) != SSA_NAME)
11227     return;
11228   stmt_vec_info phi_info;
11229   imm_use_iterator iter;
11230   use_operand_p use_p;
11231   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11232     {
11233       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11234       if (!phi)
11235         continue;
11236       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11237             && (phi_info = loop_vinfo->lookup_stmt (phi))
11238             && STMT_VINFO_RELEVANT_P (phi_info)))
11239         continue;
11240       loop_p loop = gimple_bb (phi)->loop_father;
11241       edge e = loop_latch_edge (loop);
11242       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11243         continue;
11244
11245       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11246           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11247           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11248         {
11249           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11250           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11251           gcc_assert (phi_defs.length () == latch_defs.length ());
11252           for (unsigned i = 0; i < phi_defs.length (); ++i)
11253             add_phi_arg (as_a <gphi *> (phi_defs[i]),
11254                          gimple_get_lhs (latch_defs[i]), e,
11255                          gimple_phi_arg_location (phi, e->dest_idx));
11256         }
11257       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11258         {
11259           /* For first order recurrences we have to update both uses of
11260              the latch definition, the one in the PHI node and the one
11261              in the generated VEC_PERM_EXPR.  */
11262           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11263           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11264           gcc_assert (phi_defs.length () == latch_defs.length ());
11265           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11266           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11267           for (unsigned i = 0; i < phi_defs.length (); ++i)
11268             {
11269               gassign *perm = as_a <gassign *> (phi_defs[i]);
11270               if (i > 0)
11271                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11272               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11273               update_stmt (perm);
11274             }
11275           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11276                        gimple_phi_arg_location (phi, e->dest_idx));
11277         }
11278     }
11279 }
11280
11281 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11282    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11283    stmt_vec_info.  */
11284
11285 static bool
11286 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11287                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11288 {
11289   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11290   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11291
11292   if (dump_enabled_p ())
11293     dump_printf_loc (MSG_NOTE, vect_location,
11294                      "------>vectorizing statement: %G", stmt_info->stmt);
11295
11296   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11297     vect_loop_kill_debug_uses (loop, stmt_info);
11298
11299   if (!STMT_VINFO_RELEVANT_P (stmt_info)
11300       && !STMT_VINFO_LIVE_P (stmt_info))
11301     return false;
11302
11303   if (STMT_VINFO_VECTYPE (stmt_info))
11304     {
11305       poly_uint64 nunits
11306         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11307       if (!STMT_SLP_TYPE (stmt_info)
11308           && maybe_ne (nunits, vf)
11309           && dump_enabled_p ())
11310         /* For SLP VF is set according to unrolling factor, and not
11311            to vector size, hence for SLP this print is not valid.  */
11312         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11313     }
11314
11315   /* Pure SLP statements have already been vectorized.  We still need
11316      to apply loop vectorization to hybrid SLP statements.  */
11317   if (PURE_SLP_STMT (stmt_info))
11318     return false;
11319
11320   if (dump_enabled_p ())
11321     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11322
11323   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11324     *seen_store = stmt_info;
11325
11326   return true;
11327 }
11328
11329 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11330    in the hash_map with its corresponding values.  */
11331
11332 static tree
11333 find_in_mapping (tree t, void *context)
11334 {
11335   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11336
11337   tree *value = mapping->get (t);
11338   return value ? *value : t;
11339 }
11340
11341 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11342    original loop that has now been vectorized.
11343
11344    The inits of the data_references need to be advanced with the number of
11345    iterations of the main loop.  This has been computed in vect_do_peeling and
11346    is stored in parameter ADVANCE.  We first restore the data_references
11347    initial offset with the values recored in ORIG_DRS_INIT.
11348
11349    Since the loop_vec_info of this EPILOGUE was constructed for the original
11350    loop, its stmt_vec_infos all point to the original statements.  These need
11351    to be updated to point to their corresponding copies as well as the SSA_NAMES
11352    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11353
11354    The data_reference's connections also need to be updated.  Their
11355    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11356    stmt_vec_infos, their statements need to point to their corresponding copy,
11357    if they are gather loads or scatter stores then their reference needs to be
11358    updated to point to its corresponding copy and finally we set
11359    'base_misaligned' to false as we have already peeled for alignment in the
11360    prologue of the main loop.  */
11361
11362 static void
11363 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11364 {
11365   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11366   auto_vec<gimple *> stmt_worklist;
11367   hash_map<tree,tree> mapping;
11368   gimple *orig_stmt, *new_stmt;
11369   gimple_stmt_iterator epilogue_gsi;
11370   gphi_iterator epilogue_phi_gsi;
11371   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11372   basic_block *epilogue_bbs = get_loop_body (epilogue);
11373   unsigned i;
11374
11375   free (LOOP_VINFO_BBS (epilogue_vinfo));
11376   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11377
11378   /* Advance data_reference's with the number of iterations of the previous
11379      loop and its prologue.  */
11380   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11381
11382
11383   /* The EPILOGUE loop is a copy of the original loop so they share the same
11384      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11385      point to the copied statements.  We also create a mapping of all LHS' in
11386      the original loop and all the LHS' in the EPILOGUE and create worklists to
11387      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11388   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11389     {
11390       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11391            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11392         {
11393           new_stmt = epilogue_phi_gsi.phi ();
11394
11395           gcc_assert (gimple_uid (new_stmt) > 0);
11396           stmt_vinfo
11397             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11398
11399           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11400           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11401
11402           mapping.put (gimple_phi_result (orig_stmt),
11403                        gimple_phi_result (new_stmt));
11404           /* PHI nodes can not have patterns or related statements.  */
11405           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11406                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11407         }
11408
11409       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11410            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11411         {
11412           new_stmt = gsi_stmt (epilogue_gsi);
11413           if (is_gimple_debug (new_stmt))
11414             continue;
11415
11416           gcc_assert (gimple_uid (new_stmt) > 0);
11417           stmt_vinfo
11418             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11419
11420           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11421           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11422
11423           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11424             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11425
11426           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11427             {
11428               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11429               for (gimple_stmt_iterator gsi = gsi_start (seq);
11430                    !gsi_end_p (gsi); gsi_next (&gsi))
11431                 stmt_worklist.safe_push (gsi_stmt (gsi));
11432             }
11433
11434           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11435           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11436             {
11437               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11438               stmt_worklist.safe_push (stmt);
11439               /* Set BB such that the assert in
11440                 'get_initial_def_for_reduction' is able to determine that
11441                 the BB of the related stmt is inside this loop.  */
11442               gimple_set_bb (stmt,
11443                              gimple_bb (new_stmt));
11444               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11445               gcc_assert (related_vinfo == NULL
11446                           || related_vinfo == stmt_vinfo);
11447             }
11448         }
11449     }
11450
11451   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11452      using the original main loop and thus need to be updated to refer to the
11453      cloned variables used in the epilogue.  */
11454   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11455     {
11456       gimple *stmt = stmt_worklist[i];
11457       tree *new_op;
11458
11459       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11460         {
11461           tree op = gimple_op (stmt, j);
11462           if ((new_op = mapping.get(op)))
11463             gimple_set_op (stmt, j, *new_op);
11464           else
11465             {
11466               /* PR92429: The last argument of simplify_replace_tree disables
11467                  folding when replacing arguments.  This is required as
11468                  otherwise you might end up with different statements than the
11469                  ones analyzed in vect_loop_analyze, leading to different
11470                  vectorization.  */
11471               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11472                                           &find_in_mapping, &mapping, false);
11473               gimple_set_op (stmt, j, op);
11474             }
11475         }
11476     }
11477
11478   struct data_reference *dr;
11479   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11480   FOR_EACH_VEC_ELT (datarefs, i, dr)
11481     {
11482       orig_stmt = DR_STMT (dr);
11483       gcc_assert (gimple_uid (orig_stmt) > 0);
11484       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11485       /* Data references for gather loads and scatter stores do not use the
11486          updated offset we set using ADVANCE.  Instead we have to make sure the
11487          reference in the data references point to the corresponding copy of
11488          the original in the epilogue.  Make sure to update both
11489          gather/scatters recognized by dataref analysis and also other
11490          refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
11491       auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11492       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11493           || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11494         {
11495           DR_REF (dr)
11496             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11497                                      &find_in_mapping, &mapping);
11498           DR_BASE_ADDRESS (dr)
11499             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11500                                      &find_in_mapping, &mapping);
11501         }
11502       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11503       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11504       /* The vector size of the epilogue is smaller than that of the main loop
11505          so the alignment is either the same or lower. This means the dr will
11506          thus by definition be aligned.  */
11507       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11508     }
11509
11510   epilogue_vinfo->shared->datarefs_copy.release ();
11511   epilogue_vinfo->shared->save_datarefs ();
11512 }
11513
11514 /* Function vect_transform_loop.
11515
11516    The analysis phase has determined that the loop is vectorizable.
11517    Vectorize the loop - created vectorized stmts to replace the scalar
11518    stmts in the loop, and update the loop exit condition.
11519    Returns scalar epilogue loop if any.  */
11520
11521 class loop *
11522 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11523 {
11524   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11525   class loop *epilogue = NULL;
11526   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11527   int nbbs = loop->num_nodes;
11528   int i;
11529   tree niters_vector = NULL_TREE;
11530   tree step_vector = NULL_TREE;
11531   tree niters_vector_mult_vf = NULL_TREE;
11532   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11533   unsigned int lowest_vf = constant_lower_bound (vf);
11534   gimple *stmt;
11535   bool check_profitability = false;
11536   unsigned int th;
11537   bool flat = maybe_flat_loop_profile (loop);
11538
11539   DUMP_VECT_SCOPE ("vec_transform_loop");
11540
11541   loop_vinfo->shared->check_datarefs ();
11542
11543   /* Use the more conservative vectorization threshold.  If the number
11544      of iterations is constant assume the cost check has been performed
11545      by our caller.  If the threshold makes all loops profitable that
11546      run at least the (estimated) vectorization factor number of times
11547      checking is pointless, too.  */
11548   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11549   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11550     {
11551       if (dump_enabled_p ())
11552         dump_printf_loc (MSG_NOTE, vect_location,
11553                          "Profitability threshold is %d loop iterations.\n",
11554                          th);
11555       check_profitability = true;
11556     }
11557
11558   /* Make sure there exists a single-predecessor exit bb.  Do this before
11559      versioning.   */
11560   edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11561   if (! single_pred_p (e->dest))
11562     {
11563       split_loop_exit_edge (e, true);
11564       if (dump_enabled_p ())
11565         dump_printf (MSG_NOTE, "split exit edge\n");
11566     }
11567
11568   /* Version the loop first, if required, so the profitability check
11569      comes first.  */
11570
11571   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11572     {
11573       class loop *sloop
11574         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11575       sloop->force_vectorize = false;
11576       check_profitability = false;
11577     }
11578
11579   /* Make sure there exists a single-predecessor exit bb also on the
11580      scalar loop copy.  Do this after versioning but before peeling
11581      so CFG structure is fine for both scalar and if-converted loop
11582      to make slpeel_duplicate_current_defs_from_edges face matched
11583      loop closed PHI nodes on the exit.  */
11584   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11585     {
11586       e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11587       if (! single_pred_p (e->dest))
11588         {
11589           split_loop_exit_edge (e, true);
11590           if (dump_enabled_p ())
11591             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11592         }
11593     }
11594
11595   tree niters = vect_build_loop_niters (loop_vinfo);
11596   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11597   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11598   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11599   tree advance;
11600   drs_init_vec orig_drs_init;
11601
11602   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11603                               &step_vector, &niters_vector_mult_vf, th,
11604                               check_profitability, niters_no_overflow,
11605                               &advance);
11606   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11607       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11608     {
11609       /* Ifcvt duplicates loop preheader, loop body and produces an basic
11610          block after loop exit.  We need to scale all that.  */
11611       basic_block preheader
11612         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11613       preheader->count
11614         = preheader->count.apply_probability
11615               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11616       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11617                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11618       single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11619         = preheader->count;
11620     }
11621
11622   if (niters_vector == NULL_TREE)
11623     {
11624       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11625           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11626           && known_eq (lowest_vf, vf))
11627         {
11628           niters_vector
11629             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11630                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11631           step_vector = build_one_cst (TREE_TYPE (niters));
11632         }
11633       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11634         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11635                                      &step_vector, niters_no_overflow);
11636       else
11637         /* vect_do_peeling subtracted the number of peeled prologue
11638            iterations from LOOP_VINFO_NITERS.  */
11639         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11640                                      &niters_vector, &step_vector,
11641                                      niters_no_overflow);
11642     }
11643
11644   /* 1) Make sure the loop header has exactly two entries
11645      2) Make sure we have a preheader basic block.  */
11646
11647   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11648
11649   split_edge (loop_preheader_edge (loop));
11650
11651   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11652     /* This will deal with any possible peeling.  */
11653     vect_prepare_for_masked_peels (loop_vinfo);
11654
11655   /* Schedule the SLP instances first, then handle loop vectorization
11656      below.  */
11657   if (!loop_vinfo->slp_instances.is_empty ())
11658     {
11659       DUMP_VECT_SCOPE ("scheduling SLP instances");
11660       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11661     }
11662
11663   /* FORNOW: the vectorizer supports only loops which body consist
11664      of one basic block (header + empty latch). When the vectorizer will
11665      support more involved loop forms, the order by which the BBs are
11666      traversed need to be reconsidered.  */
11667
11668   for (i = 0; i < nbbs; i++)
11669     {
11670       basic_block bb = bbs[i];
11671       stmt_vec_info stmt_info;
11672
11673       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11674            gsi_next (&si))
11675         {
11676           gphi *phi = si.phi ();
11677           if (dump_enabled_p ())
11678             dump_printf_loc (MSG_NOTE, vect_location,
11679                              "------>vectorizing phi: %G", (gimple *) phi);
11680           stmt_info = loop_vinfo->lookup_stmt (phi);
11681           if (!stmt_info)
11682             continue;
11683
11684           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11685             vect_loop_kill_debug_uses (loop, stmt_info);
11686
11687           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11688               && !STMT_VINFO_LIVE_P (stmt_info))
11689             continue;
11690
11691           if (STMT_VINFO_VECTYPE (stmt_info)
11692               && (maybe_ne
11693                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11694               && dump_enabled_p ())
11695             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11696
11697           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11698                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11699                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11700                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11701                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11702                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11703               && ! PURE_SLP_STMT (stmt_info))
11704             {
11705               if (dump_enabled_p ())
11706                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11707               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11708             }
11709         }
11710
11711       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11712            gsi_next (&si))
11713         {
11714           gphi *phi = si.phi ();
11715           stmt_info = loop_vinfo->lookup_stmt (phi);
11716           if (!stmt_info)
11717             continue;
11718
11719           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11720               && !STMT_VINFO_LIVE_P (stmt_info))
11721             continue;
11722
11723           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11724                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11725                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11726                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11727                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11728                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11729               && ! PURE_SLP_STMT (stmt_info))
11730             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11731         }
11732
11733       for (gimple_stmt_iterator si = gsi_start_bb (bb);
11734            !gsi_end_p (si);)
11735         {
11736           stmt = gsi_stmt (si);
11737           /* During vectorization remove existing clobber stmts.  */
11738           if (gimple_clobber_p (stmt))
11739             {
11740               unlink_stmt_vdef (stmt);
11741               gsi_remove (&si, true);
11742               release_defs (stmt);
11743             }
11744           else
11745             {
11746               /* Ignore vector stmts created in the outer loop.  */
11747               stmt_info = loop_vinfo->lookup_stmt (stmt);
11748
11749               /* vector stmts created in the outer-loop during vectorization of
11750                  stmts in an inner-loop may not have a stmt_info, and do not
11751                  need to be vectorized.  */
11752               stmt_vec_info seen_store = NULL;
11753               if (stmt_info)
11754                 {
11755                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11756                     {
11757                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11758                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11759                            !gsi_end_p (subsi); gsi_next (&subsi))
11760                         {
11761                           stmt_vec_info pat_stmt_info
11762                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11763                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11764                                                     &si, &seen_store);
11765                         }
11766                       stmt_vec_info pat_stmt_info
11767                         = STMT_VINFO_RELATED_STMT (stmt_info);
11768                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11769                                                     &si, &seen_store))
11770                         maybe_set_vectorized_backedge_value (loop_vinfo,
11771                                                              pat_stmt_info);
11772                     }
11773                   else
11774                     {
11775                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11776                                                     &seen_store))
11777                         maybe_set_vectorized_backedge_value (loop_vinfo,
11778                                                              stmt_info);
11779                     }
11780                 }
11781               gsi_next (&si);
11782               if (seen_store)
11783                 {
11784                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11785                     /* Interleaving.  If IS_STORE is TRUE, the
11786                        vectorization of the interleaving chain was
11787                        completed - free all the stores in the chain.  */
11788                     vect_remove_stores (loop_vinfo,
11789                                         DR_GROUP_FIRST_ELEMENT (seen_store));
11790                   else
11791                     /* Free the attached stmt_vec_info and remove the stmt.  */
11792                     loop_vinfo->remove_stmt (stmt_info);
11793                 }
11794             }
11795         }
11796
11797       /* Stub out scalar statements that must not survive vectorization.
11798          Doing this here helps with grouped statements, or statements that
11799          are involved in patterns.  */
11800       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11801            !gsi_end_p (gsi); gsi_next (&gsi))
11802         {
11803           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11804           if (!call || !gimple_call_internal_p (call))
11805             continue;
11806           internal_fn ifn = gimple_call_internal_fn (call);
11807           if (ifn == IFN_MASK_LOAD)
11808             {
11809               tree lhs = gimple_get_lhs (call);
11810               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11811                 {
11812                   tree zero = build_zero_cst (TREE_TYPE (lhs));
11813                   gimple *new_stmt = gimple_build_assign (lhs, zero);
11814                   gsi_replace (&gsi, new_stmt, true);
11815                 }
11816             }
11817           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11818             {
11819               tree lhs = gimple_get_lhs (call);
11820               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11821                 {
11822                   tree else_arg
11823                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11824                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11825                   gsi_replace (&gsi, new_stmt, true);
11826                 }
11827             }
11828         }
11829     }                           /* BBs in loop */
11830
11831   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11832      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11833   if (integer_onep (step_vector))
11834     niters_no_overflow = true;
11835   vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11836                            niters_vector, step_vector, niters_vector_mult_vf,
11837                            !niters_no_overflow);
11838
11839   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11840
11841   /* True if the final iteration might not handle a full vector's
11842      worth of scalar iterations.  */
11843   bool final_iter_may_be_partial
11844     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11845   /* The minimum number of iterations performed by the epilogue.  This
11846      is 1 when peeling for gaps because we always need a final scalar
11847      iteration.  */
11848   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11849   /* +1 to convert latch counts to loop iteration counts,
11850      -min_epilogue_iters to remove iterations that cannot be performed
11851        by the vector code.  */
11852   int bias_for_lowest = 1 - min_epilogue_iters;
11853   int bias_for_assumed = bias_for_lowest;
11854   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11855   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11856     {
11857       /* When the amount of peeling is known at compile time, the first
11858          iteration will have exactly alignment_npeels active elements.
11859          In the worst case it will have at least one.  */
11860       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11861       bias_for_lowest += lowest_vf - min_first_active;
11862       bias_for_assumed += assumed_vf - min_first_active;
11863     }
11864   /* In these calculations the "- 1" converts loop iteration counts
11865      back to latch counts.  */
11866   if (loop->any_upper_bound)
11867     {
11868       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11869       loop->nb_iterations_upper_bound
11870         = (final_iter_may_be_partial
11871            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11872                             lowest_vf) - 1
11873            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11874                              lowest_vf) - 1);
11875       if (main_vinfo
11876           /* Both peeling for alignment and peeling for gaps can end up
11877              with the scalar epilogue running for more than VF-1 iterations.  */
11878           && !main_vinfo->peeling_for_alignment
11879           && !main_vinfo->peeling_for_gaps)
11880         {
11881           unsigned int bound;
11882           poly_uint64 main_iters
11883             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11884                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11885           main_iters
11886             = upper_bound (main_iters,
11887                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11888           if (can_div_away_from_zero_p (main_iters,
11889                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11890                                         &bound))
11891             loop->nb_iterations_upper_bound
11892               = wi::umin ((bound_wide_int) (bound - 1),
11893                           loop->nb_iterations_upper_bound);
11894       }
11895   }
11896   if (loop->any_likely_upper_bound)
11897     loop->nb_iterations_likely_upper_bound
11898       = (final_iter_may_be_partial
11899          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11900                           + bias_for_lowest, lowest_vf) - 1
11901          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11902                            + bias_for_lowest, lowest_vf) - 1);
11903   if (loop->any_estimate)
11904     loop->nb_iterations_estimate
11905       = (final_iter_may_be_partial
11906          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11907                           assumed_vf) - 1
11908          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11909                            assumed_vf) - 1);
11910   scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11911                                assumed_vf, flat);
11912
11913   if (dump_enabled_p ())
11914     {
11915       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11916         {
11917           dump_printf_loc (MSG_NOTE, vect_location,
11918                            "LOOP VECTORIZED\n");
11919           if (loop->inner)
11920             dump_printf_loc (MSG_NOTE, vect_location,
11921                              "OUTER LOOP VECTORIZED\n");
11922           dump_printf (MSG_NOTE, "\n");
11923         }
11924       else
11925         dump_printf_loc (MSG_NOTE, vect_location,
11926                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11927                          GET_MODE_NAME (loop_vinfo->vector_mode));
11928     }
11929
11930   /* Loops vectorized with a variable factor won't benefit from
11931      unrolling/peeling.  */
11932   if (!vf.is_constant ())
11933     {
11934       loop->unroll = 1;
11935       if (dump_enabled_p ())
11936         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11937                          " variable-length vectorization factor\n");
11938     }
11939   /* Free SLP instances here because otherwise stmt reference counting
11940      won't work.  */
11941   slp_instance instance;
11942   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11943     vect_free_slp_instance (instance);
11944   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11945   /* Clear-up safelen field since its value is invalid after vectorization
11946      since vectorized loop can have loop-carried dependencies.  */
11947   loop->safelen = 0;
11948
11949   if (epilogue)
11950     {
11951       update_epilogue_loop_vinfo (epilogue, advance);
11952
11953       epilogue->simduid = loop->simduid;
11954       epilogue->force_vectorize = loop->force_vectorize;
11955       epilogue->dont_vectorize = false;
11956     }
11957
11958   return epilogue;
11959 }
11960
11961 /* The code below is trying to perform simple optimization - revert
11962    if-conversion for masked stores, i.e. if the mask of a store is zero
11963    do not perform it and all stored value producers also if possible.
11964    For example,
11965      for (i=0; i<n; i++)
11966        if (c[i])
11967         {
11968           p1[i] += 1;
11969           p2[i] = p3[i] +2;
11970         }
11971    this transformation will produce the following semi-hammock:
11972
11973    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11974      {
11975        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11976        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11977        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11978        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11979        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11980        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11981      }
11982 */
11983
11984 void
11985 optimize_mask_stores (class loop *loop)
11986 {
11987   basic_block *bbs = get_loop_body (loop);
11988   unsigned nbbs = loop->num_nodes;
11989   unsigned i;
11990   basic_block bb;
11991   class loop *bb_loop;
11992   gimple_stmt_iterator gsi;
11993   gimple *stmt;
11994   auto_vec<gimple *> worklist;
11995   auto_purge_vect_location sentinel;
11996
11997   vect_location = find_loop_location (loop);
11998   /* Pick up all masked stores in loop if any.  */
11999   for (i = 0; i < nbbs; i++)
12000     {
12001       bb = bbs[i];
12002       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12003            gsi_next (&gsi))
12004         {
12005           stmt = gsi_stmt (gsi);
12006           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12007             worklist.safe_push (stmt);
12008         }
12009     }
12010
12011   free (bbs);
12012   if (worklist.is_empty ())
12013     return;
12014
12015   /* Loop has masked stores.  */
12016   while (!worklist.is_empty ())
12017     {
12018       gimple *last, *last_store;
12019       edge e, efalse;
12020       tree mask;
12021       basic_block store_bb, join_bb;
12022       gimple_stmt_iterator gsi_to;
12023       tree vdef, new_vdef;
12024       gphi *phi;
12025       tree vectype;
12026       tree zero;
12027
12028       last = worklist.pop ();
12029       mask = gimple_call_arg (last, 2);
12030       bb = gimple_bb (last);
12031       /* Create then_bb and if-then structure in CFG, then_bb belongs to
12032          the same loop as if_bb.  It could be different to LOOP when two
12033          level loop-nest is vectorized and mask_store belongs to the inner
12034          one.  */
12035       e = split_block (bb, last);
12036       bb_loop = bb->loop_father;
12037       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12038       join_bb = e->dest;
12039       store_bb = create_empty_bb (bb);
12040       add_bb_to_loop (store_bb, bb_loop);
12041       e->flags = EDGE_TRUE_VALUE;
12042       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12043       /* Put STORE_BB to likely part.  */
12044       efalse->probability = profile_probability::likely ();
12045       e->probability = efalse->probability.invert ();
12046       store_bb->count = efalse->count ();
12047       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12048       if (dom_info_available_p (CDI_DOMINATORS))
12049         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12050       if (dump_enabled_p ())
12051         dump_printf_loc (MSG_NOTE, vect_location,
12052                          "Create new block %d to sink mask stores.",
12053                          store_bb->index);
12054       /* Create vector comparison with boolean result.  */
12055       vectype = TREE_TYPE (mask);
12056       zero = build_zero_cst (vectype);
12057       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12058       gsi = gsi_last_bb (bb);
12059       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12060       /* Create new PHI node for vdef of the last masked store:
12061          .MEM_2 = VDEF <.MEM_1>
12062          will be converted to
12063          .MEM.3 = VDEF <.MEM_1>
12064          and new PHI node will be created in join bb
12065          .MEM_2 = PHI <.MEM_1, .MEM_3>
12066       */
12067       vdef = gimple_vdef (last);
12068       new_vdef = make_ssa_name (gimple_vop (cfun), last);
12069       gimple_set_vdef (last, new_vdef);
12070       phi = create_phi_node (vdef, join_bb);
12071       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12072
12073       /* Put all masked stores with the same mask to STORE_BB if possible.  */
12074       while (true)
12075         {
12076           gimple_stmt_iterator gsi_from;
12077           gimple *stmt1 = NULL;
12078
12079           /* Move masked store to STORE_BB.  */
12080           last_store = last;
12081           gsi = gsi_for_stmt (last);
12082           gsi_from = gsi;
12083           /* Shift GSI to the previous stmt for further traversal.  */
12084           gsi_prev (&gsi);
12085           gsi_to = gsi_start_bb (store_bb);
12086           gsi_move_before (&gsi_from, &gsi_to);
12087           /* Setup GSI_TO to the non-empty block start.  */
12088           gsi_to = gsi_start_bb (store_bb);
12089           if (dump_enabled_p ())
12090             dump_printf_loc (MSG_NOTE, vect_location,
12091                              "Move stmt to created bb\n%G", last);
12092           /* Move all stored value producers if possible.  */
12093           while (!gsi_end_p (gsi))
12094             {
12095               tree lhs;
12096               imm_use_iterator imm_iter;
12097               use_operand_p use_p;
12098               bool res;
12099
12100               /* Skip debug statements.  */
12101               if (is_gimple_debug (gsi_stmt (gsi)))
12102                 {
12103                   gsi_prev (&gsi);
12104                   continue;
12105                 }
12106               stmt1 = gsi_stmt (gsi);
12107               /* Do not consider statements writing to memory or having
12108                  volatile operand.  */
12109               if (gimple_vdef (stmt1)
12110                   || gimple_has_volatile_ops (stmt1))
12111                 break;
12112               gsi_from = gsi;
12113               gsi_prev (&gsi);
12114               lhs = gimple_get_lhs (stmt1);
12115               if (!lhs)
12116                 break;
12117
12118               /* LHS of vectorized stmt must be SSA_NAME.  */
12119               if (TREE_CODE (lhs) != SSA_NAME)
12120                 break;
12121
12122               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12123                 {
12124                   /* Remove dead scalar statement.  */
12125                   if (has_zero_uses (lhs))
12126                     {
12127                       gsi_remove (&gsi_from, true);
12128                       continue;
12129                     }
12130                 }
12131
12132               /* Check that LHS does not have uses outside of STORE_BB.  */
12133               res = true;
12134               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12135                 {
12136                   gimple *use_stmt;
12137                   use_stmt = USE_STMT (use_p);
12138                   if (is_gimple_debug (use_stmt))
12139                     continue;
12140                   if (gimple_bb (use_stmt) != store_bb)
12141                     {
12142                       res = false;
12143                       break;
12144                     }
12145                 }
12146               if (!res)
12147                 break;
12148
12149               if (gimple_vuse (stmt1)
12150                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
12151                 break;
12152
12153               /* Can move STMT1 to STORE_BB.  */
12154               if (dump_enabled_p ())
12155                 dump_printf_loc (MSG_NOTE, vect_location,
12156                                  "Move stmt to created bb\n%G", stmt1);
12157               gsi_move_before (&gsi_from, &gsi_to);
12158               /* Shift GSI_TO for further insertion.  */
12159               gsi_prev (&gsi_to);
12160             }
12161           /* Put other masked stores with the same mask to STORE_BB.  */
12162           if (worklist.is_empty ()
12163               || gimple_call_arg (worklist.last (), 2) != mask
12164               || worklist.last () != stmt1)
12165             break;
12166           last = worklist.pop ();
12167         }
12168       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12169     }
12170 }
12171
12172 /* Decide whether it is possible to use a zero-based induction variable
12173    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
12174    the value that the induction variable must be able to hold in order
12175    to ensure that the rgroups eventually have no active vector elements.
12176    Return -1 otherwise.  */
12177
12178 widest_int
12179 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12180 {
12181   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12182   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12183   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12184
12185   /* Calculate the value that the induction variable must be able
12186      to hit in order to ensure that we end the loop with an all-false mask.
12187      This involves adding the maximum number of inactive trailing scalar
12188      iterations.  */
12189   widest_int iv_limit = -1;
12190   if (max_loop_iterations (loop, &iv_limit))
12191     {
12192       if (niters_skip)
12193         {
12194           /* Add the maximum number of skipped iterations to the
12195              maximum iteration count.  */
12196           if (TREE_CODE (niters_skip) == INTEGER_CST)
12197             iv_limit += wi::to_widest (niters_skip);
12198           else
12199             iv_limit += max_vf - 1;
12200         }
12201       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12202         /* Make a conservatively-correct assumption.  */
12203         iv_limit += max_vf - 1;
12204
12205       /* IV_LIMIT is the maximum number of latch iterations, which is also
12206          the maximum in-range IV value.  Round this value down to the previous
12207          vector alignment boundary and then add an extra full iteration.  */
12208       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12209       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12210     }
12211   return iv_limit;
12212 }
12213
12214 /* For the given rgroup_controls RGC, check whether an induction variable
12215    would ever hit a value that produces a set of all-false masks or zero
12216    lengths before wrapping around.  Return true if it's possible to wrap
12217    around before hitting the desirable value, otherwise return false.  */
12218
12219 bool
12220 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12221 {
12222   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12223
12224   if (iv_limit == -1)
12225     return true;
12226
12227   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12228   unsigned int compare_precision = TYPE_PRECISION (compare_type);
12229   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12230
12231   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12232     return true;
12233
12234   return false;
12235 }